diff --git "a/checkpoint-1771/trainer_state.json" "b/checkpoint-1771/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1771/trainer_state.json" @@ -0,0 +1,13183 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.37489415749364946, + "global_step": 1771, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.0416666666666666e-05, + "loss": 3.1158, + "theoretical_loss": 3.321573280713233, + "tokens_seen": 2990473216 + }, + { + "epoch": 0.0, + "learning_rate": 2.0833333333333333e-05, + "loss": 2.8674, + "theoretical_loss": 3.321567680436603, + "tokens_seen": 2990538752 + }, + { + "epoch": 0.0, + "learning_rate": 3.125e-05, + "loss": 3.1083, + "theoretical_loss": 3.321562080317061, + "tokens_seen": 2990604288 + }, + { + "epoch": 0.0, + "learning_rate": 4.1666666666666665e-05, + "loss": 2.8175, + "theoretical_loss": 3.3215564803546, + "tokens_seen": 2990669824 + }, + { + "epoch": 0.0, + "learning_rate": 5.208333333333334e-05, + "loss": 2.8746, + "theoretical_loss": 3.321550880549211, + "tokens_seen": 2990735360 + }, + { + "epoch": 0.0, + "learning_rate": 6.25e-05, + "loss": 2.6234, + "theoretical_loss": 3.321545280900887, + "tokens_seen": 2990800896 + }, + { + "epoch": 0.0, + "learning_rate": 7.291666666666667e-05, + "loss": 2.6986, + "theoretical_loss": 3.32153968140962, + "tokens_seen": 2990866432 + }, + { + "epoch": 0.0, + "learning_rate": 8.333333333333333e-05, + "loss": 2.9684, + "theoretical_loss": 3.3215340820754022, + "tokens_seen": 2990931968 + }, + { + "epoch": 0.0, + "learning_rate": 9.375e-05, + "loss": 3.0289, + "theoretical_loss": 3.321528482898225, + "tokens_seen": 2990997504 + }, + { + "epoch": 0.0, + "learning_rate": 0.00010416666666666667, + "loss": 2.5923, + "theoretical_loss": 3.3215228838780817, + "tokens_seen": 2991063040 + }, + { + "epoch": 0.0, + "learning_rate": 0.00011458333333333333, + "loss": 2.7436, + "theoretical_loss": 3.3215172850149637, + "tokens_seen": 2991128576 + }, + { + "epoch": 0.0, + "learning_rate": 0.000125, + "loss": 2.7711, + "theoretical_loss": 3.3215116863088636, + "tokens_seen": 2991194112 + }, + { + "epoch": 0.0, + "learning_rate": 0.00013541666666666666, + "loss": 2.4174, + "theoretical_loss": 3.3215060877597735, + "tokens_seen": 2991259648 + }, + { + "epoch": 0.0, + "learning_rate": 0.00014583333333333335, + "loss": 2.5628, + "theoretical_loss": 3.3215004893676854, + "tokens_seen": 2991325184 + }, + { + "epoch": 0.0, + "learning_rate": 0.00015625, + "loss": 2.5322, + "theoretical_loss": 3.3214948911325908, + "tokens_seen": 2991390720 + }, + { + "epoch": 0.0, + "learning_rate": 0.00016666666666666666, + "loss": 2.5703, + "theoretical_loss": 3.321489293054483, + "tokens_seen": 2991456256 + }, + { + "epoch": 0.0, + "learning_rate": 0.00017708333333333335, + "loss": 2.4464, + "theoretical_loss": 3.3214836951333537, + "tokens_seen": 2991521792 + }, + { + "epoch": 0.0, + "learning_rate": 0.0001875, + "loss": 2.5979, + "theoretical_loss": 3.321478097369195, + "tokens_seen": 2991587328 + }, + { + "epoch": 0.0, + "learning_rate": 0.00019791666666666666, + "loss": 2.6103, + "theoretical_loss": 3.321472499761999, + "tokens_seen": 2991652864 + }, + { + "epoch": 0.0, + "learning_rate": 0.00020833333333333335, + "loss": 2.5729, + "theoretical_loss": 3.321466902311758, + "tokens_seen": 2991718400 + }, + { + "epoch": 0.0, + "learning_rate": 0.00021875, + "loss": 2.3812, + "theoretical_loss": 3.321461305018464, + "tokens_seen": 2991783936 + }, + { + "epoch": 0.0, + "learning_rate": 0.00022916666666666666, + "loss": 2.461, + "theoretical_loss": 3.3214557078821096, + "tokens_seen": 2991849472 + }, + { + "epoch": 0.0, + "learning_rate": 0.00023958333333333335, + "loss": 2.3162, + "theoretical_loss": 3.3214501109026866, + "tokens_seen": 2991915008 + }, + { + "epoch": 0.01, + "learning_rate": 0.00025, + "loss": 2.4302, + "theoretical_loss": 3.321444514080187, + "tokens_seen": 2991980544 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 1640731, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.594472885131836, + "objective/train/theoretical_loss": 3.321438917414603, + "objective/train/tokens_used": 22097376, + "theoretical_loss": 3.321438917414603, + "tokens_seen": 2992046080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002604166666666667, + "loss": 2.6341, + "theoretical_loss": 3.321438917414603, + "tokens_seen": 2992046080 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002708333333333333, + "loss": 2.6529, + "theoretical_loss": 3.321433320905927, + "tokens_seen": 2992111616 + }, + { + "epoch": 0.01, + "learning_rate": 0.00028125000000000003, + "loss": 2.6057, + "theoretical_loss": 3.3214277245541513, + "tokens_seen": 2992177152 + }, + { + "epoch": 0.01, + "learning_rate": 0.0002916666666666667, + "loss": 2.6216, + "theoretical_loss": 3.3214221283592678, + "tokens_seen": 2992242688 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003020833333333333, + "loss": 2.4586, + "theoretical_loss": 3.321416532321269, + "tokens_seen": 2992308224 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003125, + "loss": 2.4143, + "theoretical_loss": 3.321410936440146, + "tokens_seen": 2992373760 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003229166666666667, + "loss": 2.6421, + "theoretical_loss": 3.3214053407158923, + "tokens_seen": 2992439296 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003333333333333333, + "loss": 2.4524, + "theoretical_loss": 3.3213997451485, + "tokens_seen": 2992504832 + }, + { + "epoch": 0.01, + "learning_rate": 0.00034375, + "loss": 2.4758, + "theoretical_loss": 3.32139414973796, + "tokens_seen": 2992570368 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003541666666666667, + "loss": 2.4524, + "theoretical_loss": 3.3213885544842654, + "tokens_seen": 2992635904 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003645833333333333, + "loss": 2.7206, + "theoretical_loss": 3.3213829593874085, + "tokens_seen": 2992701440 + }, + { + "epoch": 0.01, + "learning_rate": 0.000375, + "loss": 2.5027, + "theoretical_loss": 3.321377364447381, + "tokens_seen": 2992766976 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003854166666666667, + "loss": 2.6288, + "theoretical_loss": 3.321371769664175, + "tokens_seen": 2992832512 + }, + { + "epoch": 0.01, + "learning_rate": 0.0003958333333333333, + "loss": 2.6262, + "theoretical_loss": 3.3213661750377836, + "tokens_seen": 2992898048 + }, + { + "epoch": 0.01, + "learning_rate": 0.00040625000000000004, + "loss": 2.4324, + "theoretical_loss": 3.321360580568198, + "tokens_seen": 2992963584 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004166666666666667, + "loss": 2.3967, + "theoretical_loss": 3.3213549862554106, + "tokens_seen": 2993029120 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004270833333333333, + "loss": 2.4334, + "theoretical_loss": 3.321349392099414, + "tokens_seen": 2993094656 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004375, + "loss": 2.5264, + "theoretical_loss": 3.3213437981001994, + "tokens_seen": 2993160192 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004479166666666667, + "loss": 2.3953, + "theoretical_loss": 3.32133820425776, + "tokens_seen": 2993225728 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004583333333333333, + "loss": 2.5172, + "theoretical_loss": 3.3213326105720875, + "tokens_seen": 2993291264 + }, + { + "epoch": 0.01, + "learning_rate": 0.00046875, + "loss": 2.6647, + "theoretical_loss": 3.321327017043174, + "tokens_seen": 2993356800 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004791666666666667, + "loss": 2.4897, + "theoretical_loss": 3.3213214236710122, + "tokens_seen": 2993422336 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004895833333333333, + "loss": 2.5138, + "theoretical_loss": 3.321315830455594, + "tokens_seen": 2993487872 + }, + { + "epoch": 0.01, + "learning_rate": 0.0005, + "loss": 2.372, + "theoretical_loss": 3.321310237396911, + "tokens_seen": 2993553408 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004998930710008554, + "loss": 2.3235, + "theoretical_loss": 3.321304644494956, + "tokens_seen": 2993618944 + }, + { + "epoch": 0.01, + "objective/train/docs_used": 1641796, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.1646158695220947, + "objective/train/theoretical_loss": 3.3212990517497207, + "objective/train/tokens_used": 23735776, + "theoretical_loss": 3.3212990517497207, + "tokens_seen": 2993684480 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004997861420017108, + "loss": 2.7205, + "theoretical_loss": 3.3212990517497207, + "tokens_seen": 2993684480 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004996792130025663, + "loss": 2.6519, + "theoretical_loss": 3.3212934591611982, + "tokens_seen": 2993750016 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004995722840034218, + "loss": 2.5422, + "theoretical_loss": 3.3212878667293797, + "tokens_seen": 2993815552 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004994653550042771, + "loss": 2.6428, + "theoretical_loss": 3.321282274454258, + "tokens_seen": 2993881088 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004993584260051326, + "loss": 2.5495, + "theoretical_loss": 3.321276682335825, + "tokens_seen": 2993946624 + }, + { + "epoch": 0.01, + "learning_rate": 0.000499251497005988, + "loss": 2.5962, + "theoretical_loss": 3.321271090374073, + "tokens_seen": 2994012160 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004991445680068435, + "loss": 2.5919, + "theoretical_loss": 3.3212654985689936, + "tokens_seen": 2994077696 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004990376390076989, + "loss": 2.5942, + "theoretical_loss": 3.32125990692058, + "tokens_seen": 2994143232 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004989307100085543, + "loss": 2.2569, + "theoretical_loss": 3.3212543154288237, + "tokens_seen": 2994208768 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004988237810094098, + "loss": 2.446, + "theoretical_loss": 3.321248724093717, + "tokens_seen": 2994274304 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004987168520102651, + "loss": 2.5982, + "theoretical_loss": 3.3212431329152525, + "tokens_seen": 2994339840 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004986099230111207, + "loss": 2.3817, + "theoretical_loss": 3.3212375418934217, + "tokens_seen": 2994405376 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004985029940119761, + "loss": 2.5162, + "theoretical_loss": 3.321231951028217, + "tokens_seen": 2994470912 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004983960650128315, + "loss": 2.7158, + "theoretical_loss": 3.321226360319631, + "tokens_seen": 2994536448 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004982891360136869, + "loss": 2.555, + "theoretical_loss": 3.3212207697676552, + "tokens_seen": 2994601984 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004981822070145423, + "loss": 2.2944, + "theoretical_loss": 3.3212151793722824, + "tokens_seen": 2994667520 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004980752780153978, + "loss": 2.4699, + "theoretical_loss": 3.3212095891335043, + "tokens_seen": 2994733056 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004979683490162532, + "loss": 2.5024, + "theoretical_loss": 3.321203999051314, + "tokens_seen": 2994798592 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004978614200171087, + "loss": 2.5346, + "theoretical_loss": 3.321198409125702, + "tokens_seen": 2994864128 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004977544910179641, + "loss": 2.7699, + "theoretical_loss": 3.321192819356662, + "tokens_seen": 2994929664 + }, + { + "epoch": 0.01, + "learning_rate": 0.0004976475620188195, + "loss": 2.3356, + "theoretical_loss": 3.321187229744186, + "tokens_seen": 2994995200 + }, + { + "epoch": 0.02, + "learning_rate": 0.000497540633019675, + "loss": 2.4074, + "theoretical_loss": 3.3211816402882652, + "tokens_seen": 2995060736 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004974337040205304, + "loss": 2.6453, + "theoretical_loss": 3.321176050988893, + "tokens_seen": 2995126272 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004973267750213858, + "loss": 2.5405, + "theoretical_loss": 3.3211704618460614, + "tokens_seen": 2995191808 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004972198460222412, + "loss": 2.645, + "theoretical_loss": 3.3211648728597614, + "tokens_seen": 2995257344 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 1642448, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.835965394973755, + "objective/train/theoretical_loss": 3.3211592840299864, + "objective/train/tokens_used": 25374176, + "theoretical_loss": 3.3211592840299864, + "tokens_seen": 2995322880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004971129170230966, + "loss": 2.5601, + "theoretical_loss": 3.3211592840299864, + "tokens_seen": 2995322880 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004970059880239521, + "loss": 2.6971, + "theoretical_loss": 3.3211536953567284, + "tokens_seen": 2995388416 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004968990590248076, + "loss": 2.4257, + "theoretical_loss": 3.321148106839979, + "tokens_seen": 2995453952 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496792130025663, + "loss": 2.5419, + "theoretical_loss": 3.321142518479731, + "tokens_seen": 2995519488 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004966852010265184, + "loss": 2.7436, + "theoretical_loss": 3.321136930275977, + "tokens_seen": 2995585024 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004965782720273738, + "loss": 2.5704, + "theoretical_loss": 3.321131342228708, + "tokens_seen": 2995650560 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004964713430282293, + "loss": 2.8543, + "theoretical_loss": 3.321125754337917, + "tokens_seen": 2995716096 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004963644140290847, + "loss": 2.5733, + "theoretical_loss": 3.321120166603596, + "tokens_seen": 2995781632 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004962574850299401, + "loss": 2.5429, + "theoretical_loss": 3.3211145790257373, + "tokens_seen": 2995847168 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004961505560307955, + "loss": 2.7132, + "theoretical_loss": 3.3211089916043326, + "tokens_seen": 2995912704 + }, + { + "epoch": 0.02, + "learning_rate": 0.000496043627031651, + "loss": 2.4883, + "theoretical_loss": 3.3211034043393743, + "tokens_seen": 2995978240 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004959366980325064, + "loss": 2.5549, + "theoretical_loss": 3.3210978172308554, + "tokens_seen": 2996043776 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004958297690333619, + "loss": 2.7343, + "theoretical_loss": 3.3210922302787673, + "tokens_seen": 2996109312 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004957228400342173, + "loss": 2.5603, + "theoretical_loss": 3.3210866434831026, + "tokens_seen": 2996174848 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004956159110350727, + "loss": 2.7516, + "theoretical_loss": 3.3210810568438527, + "tokens_seen": 2996240384 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004955089820359281, + "loss": 2.4148, + "theoretical_loss": 3.3210754703610106, + "tokens_seen": 2996305920 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004954020530367836, + "loss": 2.5289, + "theoretical_loss": 3.3210698840345687, + "tokens_seen": 2996371456 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004952951240376391, + "loss": 2.4535, + "theoretical_loss": 3.321064297864518, + "tokens_seen": 2996436992 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004951881950384944, + "loss": 2.3, + "theoretical_loss": 3.3210587118508523, + "tokens_seen": 2996502528 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004950812660393499, + "loss": 2.6429, + "theoretical_loss": 3.3210531259935627, + "tokens_seen": 2996568064 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004949743370402053, + "loss": 2.4731, + "theoretical_loss": 3.3210475402926414, + "tokens_seen": 2996633600 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004948674080410608, + "loss": 2.636, + "theoretical_loss": 3.321041954748081, + "tokens_seen": 2996699136 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004947604790419162, + "loss": 2.5385, + "theoretical_loss": 3.3210363693598737, + "tokens_seen": 2996764672 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004946535500427716, + "loss": 2.4859, + "theoretical_loss": 3.321030784128012, + "tokens_seen": 2996830208 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494546621043627, + "loss": 2.5137, + "theoretical_loss": 3.321025199052487, + "tokens_seen": 2996895744 + }, + { + "epoch": 0.02, + "objective/train/docs_used": 1643593, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5586180686950684, + "objective/train/theoretical_loss": 3.321019614133292, + "objective/train/tokens_used": 27012576, + "theoretical_loss": 3.321019614133292, + "tokens_seen": 2996961280 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004944396920444824, + "loss": 2.5684, + "theoretical_loss": 3.321019614133292, + "tokens_seen": 2996961280 + }, + { + "epoch": 0.02, + "learning_rate": 0.000494332763045338, + "loss": 2.5426, + "theoretical_loss": 3.321014029370419, + "tokens_seen": 2997026816 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004942258340461933, + "loss": 2.6556, + "theoretical_loss": 3.3210084447638595, + "tokens_seen": 2997092352 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004941189050470488, + "loss": 2.668, + "theoretical_loss": 3.3210028603136066, + "tokens_seen": 2997157888 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004940119760479042, + "loss": 2.544, + "theoretical_loss": 3.320997276019652, + "tokens_seen": 2997223424 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004939050470487596, + "loss": 2.4703, + "theoretical_loss": 3.320991691881988, + "tokens_seen": 2997288960 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493798118049615, + "loss": 2.4878, + "theoretical_loss": 3.3209861079006067, + "tokens_seen": 2997354496 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004936911890504705, + "loss": 2.5617, + "theoretical_loss": 3.320980524075501, + "tokens_seen": 2997420032 + }, + { + "epoch": 0.02, + "learning_rate": 0.000493584260051326, + "loss": 2.6501, + "theoretical_loss": 3.320974940406662, + "tokens_seen": 2997485568 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004934773310521813, + "loss": 2.4307, + "theoretical_loss": 3.3209693568940826, + "tokens_seen": 2997551104 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004933704020530368, + "loss": 2.4372, + "theoretical_loss": 3.320963773537755, + "tokens_seen": 2997616640 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004932634730538923, + "loss": 2.5929, + "theoretical_loss": 3.3209581903376715, + "tokens_seen": 2997682176 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004931565440547477, + "loss": 2.3358, + "theoretical_loss": 3.320952607293824, + "tokens_seen": 2997747712 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004930496150556031, + "loss": 2.6659, + "theoretical_loss": 3.3209470244062045, + "tokens_seen": 2997813248 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004929426860564585, + "loss": 2.5873, + "theoretical_loss": 3.320941441674806, + "tokens_seen": 2997878784 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004928357570573139, + "loss": 2.5936, + "theoretical_loss": 3.32093585909962, + "tokens_seen": 2997944320 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004927288280581693, + "loss": 2.5437, + "theoretical_loss": 3.320930276680639, + "tokens_seen": 2998009856 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004926218990590249, + "loss": 2.5207, + "theoretical_loss": 3.320924694417855, + "tokens_seen": 2998075392 + }, + { + "epoch": 0.02, + "learning_rate": 0.0004925149700598803, + "loss": 2.624, + "theoretical_loss": 3.3209191123112607, + "tokens_seen": 2998140928 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004924080410607357, + "loss": 2.6163, + "theoretical_loss": 3.3209135303608477, + "tokens_seen": 2998206464 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004923011120615911, + "loss": 2.502, + "theoretical_loss": 3.320907948566609, + "tokens_seen": 2998272000 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004921941830624465, + "loss": 2.4689, + "theoretical_loss": 3.320902366928536, + "tokens_seen": 2998337536 + }, + { + "epoch": 0.03, + "learning_rate": 0.000492087254063302, + "loss": 2.7454, + "theoretical_loss": 3.3208967854466214, + "tokens_seen": 2998403072 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004919803250641574, + "loss": 2.601, + "theoretical_loss": 3.320891204120857, + "tokens_seen": 2998468608 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004918733960650129, + "loss": 2.2926, + "theoretical_loss": 3.3208856229512356, + "tokens_seen": 2998534144 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 1644132, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.451066732406616, + "objective/train/theoretical_loss": 3.320880041937749, + "objective/train/tokens_used": 28650976, + "theoretical_loss": 3.320880041937749, + "tokens_seen": 2998599680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004917664670658682, + "loss": 2.6377, + "theoretical_loss": 3.320880041937749, + "tokens_seen": 2998599680 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004916595380667237, + "loss": 2.5554, + "theoretical_loss": 3.3208744610803898, + "tokens_seen": 2998665216 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004915526090675792, + "loss": 2.6001, + "theoretical_loss": 3.3208688803791495, + "tokens_seen": 2998730752 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004914456800684346, + "loss": 2.5503, + "theoretical_loss": 3.320863299834021, + "tokens_seen": 2998796288 + }, + { + "epoch": 0.03, + "learning_rate": 0.00049133875106929, + "loss": 2.5515, + "theoretical_loss": 3.3208577194449966, + "tokens_seen": 2998861824 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004912318220701454, + "loss": 2.5162, + "theoretical_loss": 3.320852139212068, + "tokens_seen": 2998927360 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004911248930710008, + "loss": 2.5311, + "theoretical_loss": 3.3208465591352274, + "tokens_seen": 2998992896 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004910179640718563, + "loss": 2.4144, + "theoretical_loss": 3.3208409792144677, + "tokens_seen": 2999058432 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004909110350727117, + "loss": 2.6587, + "theoretical_loss": 3.3208353994497806, + "tokens_seen": 2999123968 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004908041060735672, + "loss": 2.4581, + "theoretical_loss": 3.320829819841158, + "tokens_seen": 2999189504 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004906971770744226, + "loss": 2.5588, + "theoretical_loss": 3.320824240388593, + "tokens_seen": 2999255040 + }, + { + "epoch": 0.03, + "learning_rate": 0.000490590248075278, + "loss": 2.4129, + "theoretical_loss": 3.320818661092077, + "tokens_seen": 2999320576 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004904833190761335, + "loss": 2.5726, + "theoretical_loss": 3.320813081951603, + "tokens_seen": 2999386112 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004903763900769889, + "loss": 2.6172, + "theoretical_loss": 3.3208075029671624, + "tokens_seen": 2999451648 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004902694610778443, + "loss": 2.6785, + "theoretical_loss": 3.320801924138748, + "tokens_seen": 2999517184 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004901625320786997, + "loss": 2.6485, + "theoretical_loss": 3.320796345466352, + "tokens_seen": 2999582720 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004900556030795552, + "loss": 2.5381, + "theoretical_loss": 3.3207907669499663, + "tokens_seen": 2999648256 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004899486740804106, + "loss": 2.5339, + "theoretical_loss": 3.320785188589584, + "tokens_seen": 2999713792 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004898417450812661, + "loss": 2.6513, + "theoretical_loss": 3.320779610385196, + "tokens_seen": 2999779328 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004897348160821215, + "loss": 2.6681, + "theoretical_loss": 3.3207740323367956, + "tokens_seen": 2999844864 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004896278870829769, + "loss": 2.6658, + "theoretical_loss": 3.320768454444374, + "tokens_seen": 2999910400 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004895209580838323, + "loss": 2.6133, + "theoretical_loss": 3.3207628767079242, + "tokens_seen": 2999975936 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004894140290846878, + "loss": 2.3937, + "theoretical_loss": 3.3207572991274388, + "tokens_seen": 3000041472 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004893071000855432, + "loss": 2.433, + "theoretical_loss": 3.3207517217029094, + "tokens_seen": 3000107008 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004892001710863986, + "loss": 2.5588, + "theoretical_loss": 3.3207461444343283, + "tokens_seen": 3000172544 + }, + { + "epoch": 0.03, + "objective/train/docs_used": 1645467, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7996082305908203, + "objective/train/theoretical_loss": 3.3207405673216877, + "objective/train/tokens_used": 30289376, + "theoretical_loss": 3.3207405673216877, + "tokens_seen": 3000238080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004890932420872541, + "loss": 2.7605, + "theoretical_loss": 3.3207405673216877, + "tokens_seen": 3000238080 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004889863130881094, + "loss": 2.6024, + "theoretical_loss": 3.3207349903649797, + "tokens_seen": 3000303616 + }, + { + "epoch": 0.03, + "learning_rate": 0.000488879384088965, + "loss": 2.4747, + "theoretical_loss": 3.320729413564197, + "tokens_seen": 3000369152 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004887724550898204, + "loss": 2.5954, + "theoretical_loss": 3.3207238369193313, + "tokens_seen": 3000434688 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004886655260906758, + "loss": 2.6027, + "theoretical_loss": 3.3207182604303753, + "tokens_seen": 3000500224 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004885585970915312, + "loss": 2.6733, + "theoretical_loss": 3.3207126840973213, + "tokens_seen": 3000565760 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004884516680923866, + "loss": 2.5098, + "theoretical_loss": 3.320707107920161, + "tokens_seen": 3000631296 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004883447390932422, + "loss": 2.4302, + "theoretical_loss": 3.3207015318988873, + "tokens_seen": 3000696832 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004882378100940975, + "loss": 2.5576, + "theoretical_loss": 3.3206959560334917, + "tokens_seen": 3000762368 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048813088109495294, + "loss": 2.4798, + "theoretical_loss": 3.320690380323967, + "tokens_seen": 3000827904 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004880239520958084, + "loss": 2.3385, + "theoretical_loss": 3.320684804770305, + "tokens_seen": 3000893440 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004879170230966638, + "loss": 2.4282, + "theoretical_loss": 3.3206792293724985, + "tokens_seen": 3000958976 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048781009409751925, + "loss": 2.5659, + "theoretical_loss": 3.3206736541305393, + "tokens_seen": 3001024512 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004877031650983747, + "loss": 2.7023, + "theoretical_loss": 3.3206680790444194, + "tokens_seen": 3001090048 + }, + { + "epoch": 0.03, + "learning_rate": 0.0004875962360992301, + "loss": 2.4634, + "theoretical_loss": 3.3206625041141318, + "tokens_seen": 3001155584 + }, + { + "epoch": 0.03, + "learning_rate": 0.00048748930710008556, + "loss": 2.7005, + "theoretical_loss": 3.3206569293396684, + "tokens_seen": 3001221120 + }, + { + "epoch": 0.04, + "learning_rate": 0.000487382378100941, + "loss": 2.6176, + "theoretical_loss": 3.3206513547210212, + "tokens_seen": 3001286656 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048727544910179645, + "loss": 2.4467, + "theoretical_loss": 3.320645780258183, + "tokens_seen": 3001352192 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004871685201026518, + "loss": 2.5512, + "theoretical_loss": 3.320640205951145, + "tokens_seen": 3001417728 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004870615911035073, + "loss": 2.5798, + "theoretical_loss": 3.3206346317999005, + "tokens_seen": 3001483264 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004869546621043627, + "loss": 2.6427, + "theoretical_loss": 3.3206290578044415, + "tokens_seen": 3001548800 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048684773310521813, + "loss": 2.596, + "theoretical_loss": 3.32062348396476, + "tokens_seen": 3001614336 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004867408041060736, + "loss": 2.6275, + "theoretical_loss": 3.3206179102808484, + "tokens_seen": 3001679872 + }, + { + "epoch": 0.04, + "learning_rate": 0.000486633875106929, + "loss": 2.6446, + "theoretical_loss": 3.3206123367526987, + "tokens_seen": 3001745408 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048652694610778444, + "loss": 2.248, + "theoretical_loss": 3.3206067633803036, + "tokens_seen": 3001810944 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 1646713, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3031907081604004, + "objective/train/theoretical_loss": 3.320601190163655, + "objective/train/tokens_used": 31927776, + "theoretical_loss": 3.320601190163655, + "tokens_seen": 3001876480 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048642001710863986, + "loss": 2.5457, + "theoretical_loss": 3.320601190163655, + "tokens_seen": 3001876480 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048631308810949533, + "loss": 2.6029, + "theoretical_loss": 3.320595617102745, + "tokens_seen": 3001942016 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004862061591103507, + "loss": 2.4349, + "theoretical_loss": 3.320590044197566, + "tokens_seen": 3002007552 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048609923011120617, + "loss": 2.6641, + "theoretical_loss": 3.320584471448111, + "tokens_seen": 3002073088 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004859923011120616, + "loss": 2.6479, + "theoretical_loss": 3.320578898854371, + "tokens_seen": 3002138624 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048588537211291706, + "loss": 2.6043, + "theoretical_loss": 3.3205733264163393, + "tokens_seen": 3002204160 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004857784431137724, + "loss": 2.5969, + "theoretical_loss": 3.3205677541340073, + "tokens_seen": 3002269696 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004856715141146279, + "loss": 2.5909, + "theoretical_loss": 3.320562182007368, + "tokens_seen": 3002335232 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048556458511548337, + "loss": 2.6341, + "theoretical_loss": 3.3205566100364132, + "tokens_seen": 3002400768 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048545765611633873, + "loss": 2.5535, + "theoretical_loss": 3.320551038221135, + "tokens_seen": 3002466304 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004853507271171942, + "loss": 2.6686, + "theoretical_loss": 3.320545466561526, + "tokens_seen": 3002531840 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004852437981180496, + "loss": 2.4385, + "theoretical_loss": 3.3205398950575784, + "tokens_seen": 3002597376 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048513686911890504, + "loss": 2.2543, + "theoretical_loss": 3.3205343237092846, + "tokens_seen": 3002662912 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048502994011976046, + "loss": 2.2638, + "theoretical_loss": 3.320528752516636, + "tokens_seen": 3002728448 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048492301112061594, + "loss": 2.6381, + "theoretical_loss": 3.320523181479626, + "tokens_seen": 3002793984 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048481608212147135, + "loss": 2.3925, + "theoretical_loss": 3.3205176105982463, + "tokens_seen": 3002859520 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048470915312232677, + "loss": 2.4351, + "theoretical_loss": 3.320512039872489, + "tokens_seen": 3002925056 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048460222412318225, + "loss": 2.5748, + "theoretical_loss": 3.320506469302347, + "tokens_seen": 3002990592 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048449529512403766, + "loss": 2.6547, + "theoretical_loss": 3.320500898887812, + "tokens_seen": 3003056128 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004843883661248931, + "loss": 2.5674, + "theoretical_loss": 3.3204953286288763, + "tokens_seen": 3003121664 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004842814371257485, + "loss": 2.8503, + "theoretical_loss": 3.3204897585255324, + "tokens_seen": 3003187200 + }, + { + "epoch": 0.04, + "learning_rate": 0.000484174508126604, + "loss": 2.3909, + "theoretical_loss": 3.3204841885777725, + "tokens_seen": 3003252736 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048406757912745934, + "loss": 2.6026, + "theoretical_loss": 3.3204786187855886, + "tokens_seen": 3003318272 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004839606501283148, + "loss": 2.3427, + "theoretical_loss": 3.3204730491489727, + "tokens_seen": 3003383808 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048385372112917023, + "loss": 2.5079, + "theoretical_loss": 3.320467479667918, + "tokens_seen": 3003449344 + }, + { + "epoch": 0.04, + "objective/train/docs_used": 1647443, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.66249418258667, + "objective/train/theoretical_loss": 3.3204619103424164, + "objective/train/tokens_used": 33566176, + "theoretical_loss": 3.3204619103424164, + "tokens_seen": 3003514880 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048374679213002565, + "loss": 2.5391, + "theoretical_loss": 3.3204619103424164, + "tokens_seen": 3003514880 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004836398631308811, + "loss": 2.5985, + "theoretical_loss": 3.3204563411724597, + "tokens_seen": 3003580416 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048353293413173654, + "loss": 2.5948, + "theoretical_loss": 3.3204507721580403, + "tokens_seen": 3003645952 + }, + { + "epoch": 0.04, + "learning_rate": 0.000483426005132592, + "loss": 2.3641, + "theoretical_loss": 3.320445203299151, + "tokens_seen": 3003711488 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004833190761334474, + "loss": 2.607, + "theoretical_loss": 3.3204396345957834, + "tokens_seen": 3003777024 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048321214713430285, + "loss": 2.6218, + "theoretical_loss": 3.3204340660479303, + "tokens_seen": 3003842560 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048310521813515827, + "loss": 2.7135, + "theoretical_loss": 3.320428497655584, + "tokens_seen": 3003908096 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004829982891360137, + "loss": 2.7583, + "theoretical_loss": 3.320422929418736, + "tokens_seen": 3003973632 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004828913601368691, + "loss": 2.3356, + "theoretical_loss": 3.320417361337379, + "tokens_seen": 3004039168 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004827844311377246, + "loss": 2.6196, + "theoretical_loss": 3.320411793411506, + "tokens_seen": 3004104704 + }, + { + "epoch": 0.04, + "learning_rate": 0.00048267750213858, + "loss": 2.5513, + "theoretical_loss": 3.3204062256411078, + "tokens_seen": 3004170240 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004825705731394354, + "loss": 2.3649, + "theoretical_loss": 3.320400658026178, + "tokens_seen": 3004235776 + }, + { + "epoch": 0.04, + "learning_rate": 0.0004824636441402909, + "loss": 2.5991, + "theoretical_loss": 3.320395090566708, + "tokens_seen": 3004301312 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048235671514114625, + "loss": 2.5668, + "theoretical_loss": 3.3203895232626905, + "tokens_seen": 3004366848 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004822497861420017, + "loss": 2.6184, + "theoretical_loss": 3.3203839561141173, + "tokens_seen": 3004432384 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048214285714285715, + "loss": 2.4345, + "theoretical_loss": 3.3203783891209815, + "tokens_seen": 3004497920 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004820359281437126, + "loss": 2.702, + "theoretical_loss": 3.320372822283275, + "tokens_seen": 3004563456 + }, + { + "epoch": 0.05, + "learning_rate": 0.000481928999144568, + "loss": 2.4543, + "theoretical_loss": 3.3203672556009898, + "tokens_seen": 3004628992 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048182207014542346, + "loss": 2.7525, + "theoretical_loss": 3.3203616890741183, + "tokens_seen": 3004694528 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048171514114627893, + "loss": 2.5572, + "theoretical_loss": 3.320356122702653, + "tokens_seen": 3004760064 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004816082121471343, + "loss": 2.4409, + "theoretical_loss": 3.3203505564865856, + "tokens_seen": 3004825600 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048150128314798977, + "loss": 2.5298, + "theoretical_loss": 3.320344990425909, + "tokens_seen": 3004891136 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004813943541488452, + "loss": 2.5959, + "theoretical_loss": 3.3203394245206153, + "tokens_seen": 3004956672 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004812874251497006, + "loss": 2.5989, + "theoretical_loss": 3.3203338587706965, + "tokens_seen": 3005022208 + }, + { + "epoch": 0.05, + "learning_rate": 0.000481180496150556, + "loss": 2.3394, + "theoretical_loss": 3.320328293176145, + "tokens_seen": 3005087744 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 1647973, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.042917013168335, + "objective/train/theoretical_loss": 3.3203227277369534, + "objective/train/tokens_used": 35204576, + "theoretical_loss": 3.3203227277369534, + "tokens_seen": 3005153280 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004810735671514115, + "loss": 2.4204, + "theoretical_loss": 3.3203227277369534, + "tokens_seen": 3005153280 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048096663815226686, + "loss": 2.4667, + "theoretical_loss": 3.320317162453114, + "tokens_seen": 3005218816 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048085970915312233, + "loss": 2.6191, + "theoretical_loss": 3.3203115973246184, + "tokens_seen": 3005284352 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048075278015397775, + "loss": 2.6692, + "theoretical_loss": 3.3203060323514593, + "tokens_seen": 3005349888 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004806458511548332, + "loss": 2.6315, + "theoretical_loss": 3.320300467533629, + "tokens_seen": 3005415424 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048053892215568864, + "loss": 2.3996, + "theoretical_loss": 3.3202949028711197, + "tokens_seen": 3005480960 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048043199315654406, + "loss": 2.6156, + "theoretical_loss": 3.320289338363924, + "tokens_seen": 3005546496 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048032506415739953, + "loss": 2.6943, + "theoretical_loss": 3.3202837740120335, + "tokens_seen": 3005612032 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004802181351582549, + "loss": 2.5853, + "theoretical_loss": 3.320278209815441, + "tokens_seen": 3005677568 + }, + { + "epoch": 0.05, + "learning_rate": 0.00048011120615911037, + "loss": 2.7011, + "theoretical_loss": 3.3202726457741387, + "tokens_seen": 3005743104 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004800042771599658, + "loss": 2.6477, + "theoretical_loss": 3.320267081888119, + "tokens_seen": 3005808640 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004798973481608212, + "loss": 2.5143, + "theoretical_loss": 3.320261518157374, + "tokens_seen": 3005874176 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004797904191616766, + "loss": 2.5315, + "theoretical_loss": 3.3202559545818957, + "tokens_seen": 3005939712 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004796834901625321, + "loss": 2.7104, + "theoretical_loss": 3.3202503911616765, + "tokens_seen": 3006005248 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047957656116338757, + "loss": 2.313, + "theoretical_loss": 3.320244827896709, + "tokens_seen": 3006070784 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047946963216424294, + "loss": 2.346, + "theoretical_loss": 3.320239264786986, + "tokens_seen": 3006136320 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004793627031650984, + "loss": 2.4753, + "theoretical_loss": 3.3202337018324983, + "tokens_seen": 3006201856 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047925577416595383, + "loss": 2.6649, + "theoretical_loss": 3.3202281390332393, + "tokens_seen": 3006267392 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047914884516680925, + "loss": 2.4813, + "theoretical_loss": 3.3202225763892015, + "tokens_seen": 3006332928 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047904191616766467, + "loss": 2.581, + "theoretical_loss": 3.320217013900376, + "tokens_seen": 3006398464 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047893498716852014, + "loss": 2.5711, + "theoretical_loss": 3.320211451566756, + "tokens_seen": 3006464000 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004788280581693755, + "loss": 2.5481, + "theoretical_loss": 3.3202058893883333, + "tokens_seen": 3006529536 + }, + { + "epoch": 0.05, + "learning_rate": 0.000478721129170231, + "loss": 2.7041, + "theoretical_loss": 3.320200327365101, + "tokens_seen": 3006595072 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047861420017108645, + "loss": 2.5201, + "theoretical_loss": 3.3201947654970505, + "tokens_seen": 3006660608 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004785072711719418, + "loss": 2.6215, + "theoretical_loss": 3.3201892037841745, + "tokens_seen": 3006726144 + }, + { + "epoch": 0.05, + "objective/train/docs_used": 1649106, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.610767364501953, + "objective/train/theoretical_loss": 3.320183642226465, + "objective/train/tokens_used": 36842976, + "theoretical_loss": 3.320183642226465, + "tokens_seen": 3006791680 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004784003421727973, + "loss": 2.6786, + "theoretical_loss": 3.320183642226465, + "tokens_seen": 3006791680 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004782934131736527, + "loss": 2.6331, + "theoretical_loss": 3.3201780808239145, + "tokens_seen": 3006857216 + }, + { + "epoch": 0.05, + "learning_rate": 0.0004781864841745082, + "loss": 2.6597, + "theoretical_loss": 3.3201725195765155, + "tokens_seen": 3006922752 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047807955517536354, + "loss": 2.5184, + "theoretical_loss": 3.32016695848426, + "tokens_seen": 3006988288 + }, + { + "epoch": 0.05, + "learning_rate": 0.000477972626176219, + "loss": 2.6672, + "theoretical_loss": 3.3201613975471402, + "tokens_seen": 3007053824 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047786569717707443, + "loss": 2.6462, + "theoretical_loss": 3.3201558367651485, + "tokens_seen": 3007119360 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047775876817792985, + "loss": 2.4911, + "theoretical_loss": 3.3201502761382775, + "tokens_seen": 3007184896 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047765183917878527, + "loss": 2.6191, + "theoretical_loss": 3.3201447156665194, + "tokens_seen": 3007250432 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047754491017964074, + "loss": 2.4458, + "theoretical_loss": 3.320139155349866, + "tokens_seen": 3007315968 + }, + { + "epoch": 0.05, + "learning_rate": 0.00047743798118049616, + "loss": 2.3037, + "theoretical_loss": 3.32013359518831, + "tokens_seen": 3007381504 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004773310521813516, + "loss": 2.3071, + "theoretical_loss": 3.3201280351818436, + "tokens_seen": 3007447040 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047722412318220705, + "loss": 2.6427, + "theoretical_loss": 3.320122475330459, + "tokens_seen": 3007512576 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004771171941830624, + "loss": 2.5393, + "theoretical_loss": 3.320116915634149, + "tokens_seen": 3007578112 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004770102651839179, + "loss": 2.2973, + "theoretical_loss": 3.320111356092905, + "tokens_seen": 3007643648 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004769033361847733, + "loss": 2.7483, + "theoretical_loss": 3.3201057967067205, + "tokens_seen": 3007709184 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004767964071856288, + "loss": 2.5022, + "theoretical_loss": 3.320100237475587, + "tokens_seen": 3007774720 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047668947818648415, + "loss": 2.3979, + "theoretical_loss": 3.3200946783994962, + "tokens_seen": 3007840256 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004765825491873396, + "loss": 2.651, + "theoretical_loss": 3.320089119478441, + "tokens_seen": 3007905792 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004764756201881951, + "loss": 2.6367, + "theoretical_loss": 3.3200835607124146, + "tokens_seen": 3007971328 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047636869118905046, + "loss": 2.4628, + "theoretical_loss": 3.3200780021014085, + "tokens_seen": 3008036864 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047626176218990593, + "loss": 2.5774, + "theoretical_loss": 3.3200724436454143, + "tokens_seen": 3008102400 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047615483319076135, + "loss": 2.7166, + "theoretical_loss": 3.320066885344425, + "tokens_seen": 3008167936 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047604790419161677, + "loss": 2.5213, + "theoretical_loss": 3.3200613271984336, + "tokens_seen": 3008233472 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004759409751924722, + "loss": 2.6397, + "theoretical_loss": 3.320055769207431, + "tokens_seen": 3008299008 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047583404619332766, + "loss": 2.5147, + "theoretical_loss": 3.3200502113714108, + "tokens_seen": 3008364544 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 1649879, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4313642978668213, + "objective/train/theoretical_loss": 3.3200446536903643, + "objective/train/tokens_used": 38481376, + "theoretical_loss": 3.3200446536903643, + "tokens_seen": 3008430080 + }, + { + "epoch": 0.06, + "learning_rate": 0.000475727117194183, + "loss": 2.7199, + "theoretical_loss": 3.3200446536903643, + "tokens_seen": 3008430080 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004756201881950385, + "loss": 2.429, + "theoretical_loss": 3.3200390961642845, + "tokens_seen": 3008495616 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047551325919589397, + "loss": 2.7449, + "theoretical_loss": 3.320033538793163, + "tokens_seen": 3008561152 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004754063301967494, + "loss": 2.2979, + "theoretical_loss": 3.3200279815769926, + "tokens_seen": 3008626688 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004752994011976048, + "loss": 2.5655, + "theoretical_loss": 3.320022424515766, + "tokens_seen": 3008692224 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004751924721984602, + "loss": 2.3837, + "theoretical_loss": 3.3200168676094743, + "tokens_seen": 3008757760 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004750855431993157, + "loss": 2.3228, + "theoretical_loss": 3.320011310858111, + "tokens_seen": 3008823296 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047497861420017106, + "loss": 2.4002, + "theoretical_loss": 3.320005754261668, + "tokens_seen": 3008888832 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047487168520102653, + "loss": 2.5257, + "theoretical_loss": 3.320000197820137, + "tokens_seen": 3008954368 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047476475620188195, + "loss": 2.6074, + "theoretical_loss": 3.319994641533511, + "tokens_seen": 3009019904 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047465782720273737, + "loss": 2.6883, + "theoretical_loss": 3.3199890854017826, + "tokens_seen": 3009085440 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047455089820359284, + "loss": 2.3395, + "theoretical_loss": 3.319983529424943, + "tokens_seen": 3009150976 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047444396920444826, + "loss": 2.4104, + "theoretical_loss": 3.319977973602986, + "tokens_seen": 3009216512 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047433704020530374, + "loss": 2.4436, + "theoretical_loss": 3.3199724179359027, + "tokens_seen": 3009282048 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004742301112061591, + "loss": 2.3034, + "theoretical_loss": 3.3199668624236853, + "tokens_seen": 3009347584 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047412318220701457, + "loss": 2.4065, + "theoretical_loss": 3.319961307066327, + "tokens_seen": 3009413120 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047401625320787, + "loss": 2.5668, + "theoretical_loss": 3.31995575186382, + "tokens_seen": 3009478656 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004739093242087254, + "loss": 2.5791, + "theoretical_loss": 3.3199501968161558, + "tokens_seen": 3009544192 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047380239520958083, + "loss": 2.6598, + "theoretical_loss": 3.3199446419233274, + "tokens_seen": 3009609728 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004736954662104363, + "loss": 2.7689, + "theoretical_loss": 3.319939087185327, + "tokens_seen": 3009675264 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047358853721129167, + "loss": 2.6223, + "theoretical_loss": 3.319933532602147, + "tokens_seen": 3009740800 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047348160821214714, + "loss": 2.4611, + "theoretical_loss": 3.3199279781737796, + "tokens_seen": 3009806336 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004733746792130026, + "loss": 2.5834, + "theoretical_loss": 3.3199224239002167, + "tokens_seen": 3009871872 + }, + { + "epoch": 0.06, + "learning_rate": 0.000473267750213858, + "loss": 2.7345, + "theoretical_loss": 3.3199168697814514, + "tokens_seen": 3009937408 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047316082121471345, + "loss": 2.456, + "theoretical_loss": 3.3199113158174756, + "tokens_seen": 3010002944 + }, + { + "epoch": 0.06, + "objective/train/docs_used": 1651225, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.715864896774292, + "objective/train/theoretical_loss": 3.3199057620082812, + "objective/train/tokens_used": 40119776, + "theoretical_loss": 3.3199057620082812, + "tokens_seen": 3010068480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047305389221556887, + "loss": 2.441, + "theoretical_loss": 3.3199057620082812, + "tokens_seen": 3010068480 + }, + { + "epoch": 0.06, + "learning_rate": 0.00047294696321642434, + "loss": 2.4536, + "theoretical_loss": 3.3199002083538613, + "tokens_seen": 3010134016 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004728400342172797, + "loss": 2.6436, + "theoretical_loss": 3.319894654854208, + "tokens_seen": 3010199552 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004727331052181352, + "loss": 2.3205, + "theoretical_loss": 3.319889101509313, + "tokens_seen": 3010265088 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004726261762189906, + "loss": 2.6071, + "theoretical_loss": 3.3198835483191695, + "tokens_seen": 3010330624 + }, + { + "epoch": 0.06, + "learning_rate": 0.000472519247219846, + "loss": 2.6072, + "theoretical_loss": 3.319877995283769, + "tokens_seen": 3010396160 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004724123182207015, + "loss": 2.5943, + "theoretical_loss": 3.319872442403105, + "tokens_seen": 3010461696 + }, + { + "epoch": 0.06, + "learning_rate": 0.0004723053892215569, + "loss": 2.4161, + "theoretical_loss": 3.3198668896771686, + "tokens_seen": 3010527232 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004721984602224123, + "loss": 2.5087, + "theoretical_loss": 3.3198613371059524, + "tokens_seen": 3010592768 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047209153122326774, + "loss": 2.5444, + "theoretical_loss": 3.319855784689449, + "tokens_seen": 3010658304 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004719846022241232, + "loss": 2.6907, + "theoretical_loss": 3.319850232427651, + "tokens_seen": 3010723840 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004718776732249786, + "loss": 2.4364, + "theoretical_loss": 3.31984468032055, + "tokens_seen": 3010789376 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047177074422583405, + "loss": 2.3893, + "theoretical_loss": 3.3198391283681383, + "tokens_seen": 3010854912 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047166381522668947, + "loss": 2.5048, + "theoretical_loss": 3.3198335765704092, + "tokens_seen": 3010920448 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047155688622754495, + "loss": 2.6298, + "theoretical_loss": 3.3198280249273546, + "tokens_seen": 3010985984 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047144995722840036, + "loss": 2.8107, + "theoretical_loss": 3.319822473438966, + "tokens_seen": 3011051520 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004713430282292558, + "loss": 2.503, + "theoretical_loss": 3.319816922105237, + "tokens_seen": 3011117056 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047123609923011126, + "loss": 2.5555, + "theoretical_loss": 3.3198113709261587, + "tokens_seen": 3011182592 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004711291702309666, + "loss": 2.4985, + "theoretical_loss": 3.319805819901724, + "tokens_seen": 3011248128 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004710222412318221, + "loss": 2.4865, + "theoretical_loss": 3.3198002690319255, + "tokens_seen": 3011313664 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004709153122326775, + "loss": 2.5541, + "theoretical_loss": 3.3197947183167553, + "tokens_seen": 3011379200 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047080838323353293, + "loss": 2.4878, + "theoretical_loss": 3.319789167756206, + "tokens_seen": 3011444736 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047070145423438835, + "loss": 2.4241, + "theoretical_loss": 3.319783617350269, + "tokens_seen": 3011510272 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004705945252352438, + "loss": 2.6346, + "theoretical_loss": 3.3197780670989374, + "tokens_seen": 3011575808 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047048759623609924, + "loss": 2.3981, + "theoretical_loss": 3.319772517002204, + "tokens_seen": 3011641344 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 1651868, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2719619274139404, + "objective/train/theoretical_loss": 3.31976696706006, + "objective/train/tokens_used": 41758176, + "theoretical_loss": 3.31976696706006, + "tokens_seen": 3011706880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047038066723695466, + "loss": 2.4118, + "theoretical_loss": 3.31976696706006, + "tokens_seen": 3011706880 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047027373823781013, + "loss": 2.6711, + "theoretical_loss": 3.319761417272498, + "tokens_seen": 3011772416 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047016680923866555, + "loss": 2.6537, + "theoretical_loss": 3.319755867639511, + "tokens_seen": 3011837952 + }, + { + "epoch": 0.07, + "learning_rate": 0.00047005988023952097, + "loss": 2.6158, + "theoretical_loss": 3.319750318161091, + "tokens_seen": 3011903488 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004699529512403764, + "loss": 2.2521, + "theoretical_loss": 3.3197447688372295, + "tokens_seen": 3011969024 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046984602224123186, + "loss": 2.6413, + "theoretical_loss": 3.3197392196679205, + "tokens_seen": 3012034560 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004697390932420872, + "loss": 2.7066, + "theoretical_loss": 3.3197336706531546, + "tokens_seen": 3012100096 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004696321642429427, + "loss": 2.4956, + "theoretical_loss": 3.3197281217929255, + "tokens_seen": 3012165632 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046952523524379817, + "loss": 2.7685, + "theoretical_loss": 3.319722573087225, + "tokens_seen": 3012231168 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046941830624465353, + "loss": 2.6271, + "theoretical_loss": 3.319717024536045, + "tokens_seen": 3012296704 + }, + { + "epoch": 0.07, + "learning_rate": 0.000469311377245509, + "loss": 2.6398, + "theoretical_loss": 3.3197114761393784, + "tokens_seen": 3012362240 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004692044482463644, + "loss": 2.5921, + "theoretical_loss": 3.3197059278972176, + "tokens_seen": 3012427776 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046909751924721984, + "loss": 2.6227, + "theoretical_loss": 3.3197003798095546, + "tokens_seen": 3012493312 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046899059024807526, + "loss": 2.6272, + "theoretical_loss": 3.3196948318763817, + "tokens_seen": 3012558848 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046888366124893074, + "loss": 2.4564, + "theoretical_loss": 3.3196892840976915, + "tokens_seen": 3012624384 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046877673224978616, + "loss": 2.5827, + "theoretical_loss": 3.319683736473476, + "tokens_seen": 3012689920 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004686698032506416, + "loss": 2.5221, + "theoretical_loss": 3.319678189003728, + "tokens_seen": 3012755456 + }, + { + "epoch": 0.07, + "learning_rate": 0.000468562874251497, + "loss": 2.2411, + "theoretical_loss": 3.3196726416884395, + "tokens_seen": 3012820992 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046845594525235247, + "loss": 2.5399, + "theoretical_loss": 3.319667094527603, + "tokens_seen": 3012886528 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004683490162532079, + "loss": 2.5599, + "theoretical_loss": 3.3196615475212106, + "tokens_seen": 3012952064 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004682420872540633, + "loss": 2.6186, + "theoretical_loss": 3.319656000669255, + "tokens_seen": 3013017600 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004681351582549188, + "loss": 2.4364, + "theoretical_loss": 3.3196504539717284, + "tokens_seen": 3013083136 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046802822925577414, + "loss": 2.4645, + "theoretical_loss": 3.319644907428623, + "tokens_seen": 3013148672 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004679213002566296, + "loss": 2.3282, + "theoretical_loss": 3.3196393610399317, + "tokens_seen": 3013214208 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046781437125748503, + "loss": 2.3968, + "theoretical_loss": 3.3196338148056457, + "tokens_seen": 3013279744 + }, + { + "epoch": 0.07, + "objective/train/docs_used": 1652881, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.02851939201355, + "objective/train/theoretical_loss": 3.3196282687257583, + "objective/train/tokens_used": 43396576, + "theoretical_loss": 3.3196282687257583, + "tokens_seen": 3013345280 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004677074422583405, + "loss": 2.73, + "theoretical_loss": 3.3196282687257583, + "tokens_seen": 3013345280 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046760051325919587, + "loss": 2.7081, + "theoretical_loss": 3.3196227228002617, + "tokens_seen": 3013410816 + }, + { + "epoch": 0.07, + "learning_rate": 0.00046749358426005134, + "loss": 2.4167, + "theoretical_loss": 3.3196171770291483, + "tokens_seen": 3013476352 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004673866552609068, + "loss": 2.6501, + "theoretical_loss": 3.31961163141241, + "tokens_seen": 3013541888 + }, + { + "epoch": 0.07, + "learning_rate": 0.0004672797262617622, + "loss": 2.711, + "theoretical_loss": 3.3196060859500394, + "tokens_seen": 3013607424 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046717279726261765, + "loss": 2.6538, + "theoretical_loss": 3.319600540642029, + "tokens_seen": 3013672960 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046706586826347307, + "loss": 2.7554, + "theoretical_loss": 3.319594995488371, + "tokens_seen": 3013738496 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004669589392643285, + "loss": 2.6815, + "theoretical_loss": 3.3195894504890573, + "tokens_seen": 3013804032 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004668520102651839, + "loss": 2.4327, + "theoretical_loss": 3.3195839056440812, + "tokens_seen": 3013869568 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004667450812660394, + "loss": 2.6057, + "theoretical_loss": 3.3195783609534346, + "tokens_seen": 3013935104 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046663815226689474, + "loss": 2.8111, + "theoretical_loss": 3.3195728164171094, + "tokens_seen": 3014000640 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004665312232677502, + "loss": 2.5688, + "theoretical_loss": 3.3195672720350986, + "tokens_seen": 3014066176 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004664242942686057, + "loss": 2.4503, + "theoretical_loss": 3.319561727807394, + "tokens_seen": 3014131712 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004663173652694611, + "loss": 2.6877, + "theoretical_loss": 3.319556183733989, + "tokens_seen": 3014197248 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046621043627031653, + "loss": 2.6751, + "theoretical_loss": 3.3195506398148744, + "tokens_seen": 3014262784 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046610350727117195, + "loss": 2.4173, + "theoretical_loss": 3.3195450960500437, + "tokens_seen": 3014328320 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004659965782720274, + "loss": 2.6038, + "theoretical_loss": 3.319539552439489, + "tokens_seen": 3014393856 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004658896492728828, + "loss": 2.638, + "theoretical_loss": 3.3195340089832026, + "tokens_seen": 3014459392 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046578272027373826, + "loss": 2.5091, + "theoretical_loss": 3.3195284656811763, + "tokens_seen": 3014524928 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004656757912745937, + "loss": 2.807, + "theoretical_loss": 3.3195229225334035, + "tokens_seen": 3014590464 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004655688622754491, + "loss": 2.3861, + "theoretical_loss": 3.319517379539876, + "tokens_seen": 3014656000 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004654619332763045, + "loss": 2.8883, + "theoretical_loss": 3.3195118367005856, + "tokens_seen": 3014721536 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046535500427716, + "loss": 2.6202, + "theoretical_loss": 3.3195062940155258, + "tokens_seen": 3014787072 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004652480752780154, + "loss": 2.6868, + "theoretical_loss": 3.3195007514846884, + "tokens_seen": 3014852608 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004651411462788708, + "loss": 2.4873, + "theoretical_loss": 3.3194952091080654, + "tokens_seen": 3014918144 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 1653310, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 1.8691486120224, + "objective/train/theoretical_loss": 3.3194896668856497, + "objective/train/tokens_used": 45034976, + "theoretical_loss": 3.3194896668856497, + "tokens_seen": 3014983680 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004650342172797263, + "loss": 2.2912, + "theoretical_loss": 3.3194896668856497, + "tokens_seen": 3014983680 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004649272882805817, + "loss": 2.609, + "theoretical_loss": 3.3194841248174334, + "tokens_seen": 3015049216 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046482035928143713, + "loss": 2.77, + "theoretical_loss": 3.319478582903409, + "tokens_seen": 3015114752 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046471343028229255, + "loss": 2.8639, + "theoretical_loss": 3.3194730411435684, + "tokens_seen": 3015180288 + }, + { + "epoch": 0.08, + "learning_rate": 0.000464606501283148, + "loss": 2.601, + "theoretical_loss": 3.3194674995379048, + "tokens_seen": 3015245824 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004644995722840034, + "loss": 2.2598, + "theoretical_loss": 3.3194619580864098, + "tokens_seen": 3015311360 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046439264328485886, + "loss": 2.6682, + "theoretical_loss": 3.3194564167890763, + "tokens_seen": 3015376896 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046428571428571433, + "loss": 2.5971, + "theoretical_loss": 3.3194508756458965, + "tokens_seen": 3015442432 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004641787852865697, + "loss": 2.5004, + "theoretical_loss": 3.319445334656862, + "tokens_seen": 3015507968 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046407185628742517, + "loss": 2.7767, + "theoretical_loss": 3.319439793821967, + "tokens_seen": 3015573504 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004639649272882806, + "loss": 2.6512, + "theoretical_loss": 3.3194342531412016, + "tokens_seen": 3015639040 + }, + { + "epoch": 0.08, + "learning_rate": 0.000463857998289136, + "loss": 2.6137, + "theoretical_loss": 3.3194287126145596, + "tokens_seen": 3015704576 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046375106928999143, + "loss": 2.4597, + "theoretical_loss": 3.319423172242033, + "tokens_seen": 3015770112 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004636441402908469, + "loss": 2.5436, + "theoretical_loss": 3.3194176320236144, + "tokens_seen": 3015835648 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004635372112917023, + "loss": 2.5871, + "theoretical_loss": 3.319412091959296, + "tokens_seen": 3015901184 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046343028229255774, + "loss": 2.709, + "theoretical_loss": 3.31940655204907, + "tokens_seen": 3015966720 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004633233532934132, + "loss": 2.7242, + "theoretical_loss": 3.319401012292929, + "tokens_seen": 3016032256 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046321642429426863, + "loss": 2.4904, + "theoretical_loss": 3.319395472690865, + "tokens_seen": 3016097792 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046310949529512405, + "loss": 2.4424, + "theoretical_loss": 3.3193899332428707, + "tokens_seen": 3016163328 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046300256629597947, + "loss": 2.5032, + "theoretical_loss": 3.3193843939489382, + "tokens_seen": 3016228864 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046289563729683494, + "loss": 2.702, + "theoretical_loss": 3.3193788548090604, + "tokens_seen": 3016294400 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004627887082976903, + "loss": 2.622, + "theoretical_loss": 3.319373315823229, + "tokens_seen": 3016359936 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004626817792985458, + "loss": 2.7673, + "theoretical_loss": 3.319367776991437, + "tokens_seen": 3016425472 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004625748502994012, + "loss": 2.4798, + "theoretical_loss": 3.3193622383136763, + "tokens_seen": 3016491008 + }, + { + "epoch": 0.08, + "learning_rate": 0.00046246792130025667, + "loss": 2.919, + "theoretical_loss": 3.3193566997899397, + "tokens_seen": 3016556544 + }, + { + "epoch": 0.08, + "objective/train/docs_used": 1654610, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.208578109741211, + "objective/train/theoretical_loss": 3.3193511614202187, + "objective/train/tokens_used": 46673376, + "theoretical_loss": 3.3193511614202187, + "tokens_seen": 3016622080 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004623609923011121, + "loss": 2.5259, + "theoretical_loss": 3.3193511614202187, + "tokens_seen": 3016622080 + }, + { + "epoch": 0.08, + "learning_rate": 0.0004622540633019675, + "loss": 2.5413, + "theoretical_loss": 3.319345623204507, + "tokens_seen": 3016687616 + }, + { + "epoch": 0.09, + "learning_rate": 0.000462147134302823, + "loss": 2.7374, + "theoretical_loss": 3.319340085142796, + "tokens_seen": 3016753152 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046204020530367834, + "loss": 2.4026, + "theoretical_loss": 3.319334547235078, + "tokens_seen": 3016818688 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004619332763045338, + "loss": 2.5737, + "theoretical_loss": 3.319329009481346, + "tokens_seen": 3016884224 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046182634730538923, + "loss": 2.4429, + "theoretical_loss": 3.319323471881592, + "tokens_seen": 3016949760 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046171941830624465, + "loss": 2.6043, + "theoretical_loss": 3.3193179344358086, + "tokens_seen": 3017015296 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046161248930710007, + "loss": 2.5731, + "theoretical_loss": 3.3193123971439875, + "tokens_seen": 3017080832 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046150556030795554, + "loss": 2.6597, + "theoretical_loss": 3.319306860006122, + "tokens_seen": 3017146368 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004613986313088109, + "loss": 2.5896, + "theoretical_loss": 3.319301323022204, + "tokens_seen": 3017211904 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004612917023096664, + "loss": 2.7416, + "theoretical_loss": 3.319295786192226, + "tokens_seen": 3017277440 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046118477331052185, + "loss": 2.763, + "theoretical_loss": 3.3192902495161802, + "tokens_seen": 3017342976 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046107784431137727, + "loss": 2.6935, + "theoretical_loss": 3.319284712994059, + "tokens_seen": 3017408512 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004609709153122327, + "loss": 2.6153, + "theoretical_loss": 3.319279176625855, + "tokens_seen": 3017474048 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004608639863130881, + "loss": 2.5454, + "theoretical_loss": 3.3192736404115606, + "tokens_seen": 3017539584 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004607570573139436, + "loss": 2.564, + "theoretical_loss": 3.319268104351168, + "tokens_seen": 3017605120 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046065012831479895, + "loss": 2.4576, + "theoretical_loss": 3.3192625684446693, + "tokens_seen": 3017670656 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004605431993156544, + "loss": 2.8301, + "theoretical_loss": 3.3192570326920574, + "tokens_seen": 3017736192 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046043627031650984, + "loss": 2.5667, + "theoretical_loss": 3.3192514970933242, + "tokens_seen": 3017801728 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046032934131736526, + "loss": 2.2958, + "theoretical_loss": 3.3192459616484626, + "tokens_seen": 3017867264 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046022241231822073, + "loss": 2.5172, + "theoretical_loss": 3.319240426357465, + "tokens_seen": 3017932800 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046011548331907615, + "loss": 2.6184, + "theoretical_loss": 3.319234891220323, + "tokens_seen": 3017998336 + }, + { + "epoch": 0.09, + "learning_rate": 0.00046000855431993157, + "loss": 2.6125, + "theoretical_loss": 3.31922935623703, + "tokens_seen": 3018063872 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459901625320787, + "loss": 2.5618, + "theoretical_loss": 3.3192238214075775, + "tokens_seen": 3018129408 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045979469632164246, + "loss": 2.6876, + "theoretical_loss": 3.3192182867319584, + "tokens_seen": 3018194944 + }, + { + "epoch": 0.09, + "objective/train/docs_used": 1655301, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.565507411956787, + "objective/train/theoretical_loss": 3.319212752210165, + "objective/train/tokens_used": 48311776, + "theoretical_loss": 3.319212752210165, + "tokens_seen": 3018260480 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004596877673224979, + "loss": 2.4072, + "theoretical_loss": 3.319212752210165, + "tokens_seen": 3018260480 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004595808383233533, + "loss": 2.565, + "theoretical_loss": 3.3192072178421896, + "tokens_seen": 3018326016 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004594739093242087, + "loss": 2.7793, + "theoretical_loss": 3.3192016836280245, + "tokens_seen": 3018391552 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004593669803250642, + "loss": 2.587, + "theoretical_loss": 3.319196149567662, + "tokens_seen": 3018457088 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004592600513259196, + "loss": 2.6595, + "theoretical_loss": 3.3191906156610953, + "tokens_seen": 3018522624 + }, + { + "epoch": 0.09, + "learning_rate": 0.000459153122326775, + "loss": 2.6921, + "theoretical_loss": 3.3191850819083157, + "tokens_seen": 3018588160 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004590461933276305, + "loss": 2.5763, + "theoretical_loss": 3.3191795483093163, + "tokens_seen": 3018653696 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045893926432848586, + "loss": 2.5511, + "theoretical_loss": 3.319174014864089, + "tokens_seen": 3018719232 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045883233532934134, + "loss": 2.4876, + "theoretical_loss": 3.3191684815726266, + "tokens_seen": 3018784768 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045872540633019675, + "loss": 2.5574, + "theoretical_loss": 3.319162948434921, + "tokens_seen": 3018850304 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045861847733105217, + "loss": 2.6569, + "theoretical_loss": 3.3191574154509658, + "tokens_seen": 3018915840 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004585115483319076, + "loss": 2.6353, + "theoretical_loss": 3.319151882620752, + "tokens_seen": 3018981376 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045840461933276306, + "loss": 2.6696, + "theoretical_loss": 3.319146349944272, + "tokens_seen": 3019046912 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045829769033361854, + "loss": 2.5971, + "theoretical_loss": 3.3191408174215193, + "tokens_seen": 3019112448 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004581907613344739, + "loss": 2.6662, + "theoretical_loss": 3.319135285052486, + "tokens_seen": 3019177984 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004580838323353294, + "loss": 2.6373, + "theoretical_loss": 3.3191297528371635, + "tokens_seen": 3019243520 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004579769033361848, + "loss": 2.4068, + "theoretical_loss": 3.319124220775545, + "tokens_seen": 3019309056 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004578699743370402, + "loss": 2.5871, + "theoretical_loss": 3.319118688867623, + "tokens_seen": 3019374592 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045776304533789563, + "loss": 2.5294, + "theoretical_loss": 3.319113157113389, + "tokens_seen": 3019440128 + }, + { + "epoch": 0.09, + "learning_rate": 0.0004576561163387511, + "loss": 2.4393, + "theoretical_loss": 3.319107625512837, + "tokens_seen": 3019505664 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045754918733960647, + "loss": 2.6992, + "theoretical_loss": 3.3191020940659577, + "tokens_seen": 3019571200 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045744225834046194, + "loss": 2.5488, + "theoretical_loss": 3.3190965627727445, + "tokens_seen": 3019636736 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045733532934131736, + "loss": 2.5638, + "theoretical_loss": 3.3190910316331896, + "tokens_seen": 3019702272 + }, + { + "epoch": 0.09, + "learning_rate": 0.00045722840034217283, + "loss": 2.6045, + "theoretical_loss": 3.3190855006472857, + "tokens_seen": 3019767808 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045712147134302825, + "loss": 2.5425, + "theoretical_loss": 3.319079969815024, + "tokens_seen": 3019833344 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 1656638, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6638667583465576, + "objective/train/theoretical_loss": 3.3190744391363984, + "objective/train/tokens_used": 49950176, + "theoretical_loss": 3.3190744391363984, + "tokens_seen": 3019898880 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045701454234388367, + "loss": 2.6716, + "theoretical_loss": 3.3190744391363984, + "tokens_seen": 3019898880 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045690761334473914, + "loss": 2.6018, + "theoretical_loss": 3.3190689086114005, + "tokens_seen": 3019964416 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004568006843455945, + "loss": 2.4702, + "theoretical_loss": 3.3190633782400223, + "tokens_seen": 3020029952 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045669375534645, + "loss": 2.5531, + "theoretical_loss": 3.3190578480222577, + "tokens_seen": 3020095488 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004565868263473054, + "loss": 2.5263, + "theoretical_loss": 3.3190523179580973, + "tokens_seen": 3020161024 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004564798973481608, + "loss": 2.6229, + "theoretical_loss": 3.3190467880475345, + "tokens_seen": 3020226560 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045637296834901623, + "loss": 2.479, + "theoretical_loss": 3.3190412582905617, + "tokens_seen": 3020292096 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004562660393498717, + "loss": 2.6546, + "theoretical_loss": 3.319035728687171, + "tokens_seen": 3020357632 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004561591103507271, + "loss": 2.5858, + "theoretical_loss": 3.319030199237355, + "tokens_seen": 3020423168 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045605218135158254, + "loss": 2.3675, + "theoretical_loss": 3.319024669941106, + "tokens_seen": 3020488704 + }, + { + "epoch": 0.1, + "learning_rate": 0.000455945252352438, + "loss": 2.5846, + "theoretical_loss": 3.3190191407984164, + "tokens_seen": 3020554240 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045583832335329344, + "loss": 2.4535, + "theoretical_loss": 3.319013611809279, + "tokens_seen": 3020619776 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045573139435414885, + "loss": 2.7591, + "theoretical_loss": 3.3190080829736854, + "tokens_seen": 3020685312 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004556244653550043, + "loss": 2.5729, + "theoretical_loss": 3.319002554291629, + "tokens_seen": 3020750848 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045551753635585975, + "loss": 2.5332, + "theoretical_loss": 3.318997025763101, + "tokens_seen": 3020816384 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004554106073567151, + "loss": 2.5638, + "theoretical_loss": 3.3189914973880947, + "tokens_seen": 3020881920 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004553036783575706, + "loss": 2.476, + "theoretical_loss": 3.318985969166602, + "tokens_seen": 3020947456 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045519674935842606, + "loss": 2.7148, + "theoretical_loss": 3.318980441098616, + "tokens_seen": 3021012992 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004550898203592814, + "loss": 2.5545, + "theoretical_loss": 3.3189749131841286, + "tokens_seen": 3021078528 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004549828913601369, + "loss": 2.5361, + "theoretical_loss": 3.3189693854231326, + "tokens_seen": 3021144064 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004548759623609923, + "loss": 2.7275, + "theoretical_loss": 3.3189638578156195, + "tokens_seen": 3021209600 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045476903336184773, + "loss": 2.7548, + "theoretical_loss": 3.3189583303615824, + "tokens_seen": 3021275136 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045466210436270315, + "loss": 2.6902, + "theoretical_loss": 3.3189528030610136, + "tokens_seen": 3021340672 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004545551753635586, + "loss": 2.3308, + "theoretical_loss": 3.318947275913906, + "tokens_seen": 3021406208 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045444824636441404, + "loss": 2.437, + "theoretical_loss": 3.318941748920251, + "tokens_seen": 3021471744 + }, + { + "epoch": 0.1, + "objective/train/docs_used": 1657764, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8103880882263184, + "objective/train/theoretical_loss": 3.318936222080042, + "objective/train/tokens_used": 51588576, + "theoretical_loss": 3.318936222080042, + "tokens_seen": 3021537280 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045434131736526946, + "loss": 2.5186, + "theoretical_loss": 3.318936222080042, + "tokens_seen": 3021537280 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045423438836612493, + "loss": 2.619, + "theoretical_loss": 3.318930695393271, + "tokens_seen": 3021602816 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045412745936698035, + "loss": 2.6483, + "theoretical_loss": 3.3189251688599297, + "tokens_seen": 3021668352 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045402053036783577, + "loss": 2.446, + "theoretical_loss": 3.3189196424800116, + "tokens_seen": 3021733888 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004539136013686912, + "loss": 2.2355, + "theoretical_loss": 3.318914116253509, + "tokens_seen": 3021799424 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045380667236954666, + "loss": 2.4651, + "theoretical_loss": 3.3189085901804134, + "tokens_seen": 3021864960 + }, + { + "epoch": 0.1, + "learning_rate": 0.000453699743370402, + "loss": 2.3771, + "theoretical_loss": 3.3189030642607182, + "tokens_seen": 3021930496 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004535928143712575, + "loss": 2.2859, + "theoretical_loss": 3.3188975384944155, + "tokens_seen": 3021996032 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004534858853721129, + "loss": 2.4815, + "theoretical_loss": 3.3188920128814976, + "tokens_seen": 3022061568 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045337895637296834, + "loss": 2.4833, + "theoretical_loss": 3.318886487421957, + "tokens_seen": 3022127104 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045327202737382375, + "loss": 2.6793, + "theoretical_loss": 3.3188809621157858, + "tokens_seen": 3022192640 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045316509837467923, + "loss": 2.4563, + "theoretical_loss": 3.318875436962977, + "tokens_seen": 3022258176 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004530581693755347, + "loss": 2.5047, + "theoretical_loss": 3.3188699119635228, + "tokens_seen": 3022323712 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045295124037639006, + "loss": 2.7077, + "theoretical_loss": 3.3188643871174155, + "tokens_seen": 3022389248 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045284431137724554, + "loss": 2.5694, + "theoretical_loss": 3.3188588624246473, + "tokens_seen": 3022454784 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045273738237810096, + "loss": 2.4641, + "theoretical_loss": 3.318853337885211, + "tokens_seen": 3022520320 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004526304533789564, + "loss": 2.5008, + "theoretical_loss": 3.318847813499099, + "tokens_seen": 3022585856 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004525235243798118, + "loss": 2.5933, + "theoretical_loss": 3.318842289266304, + "tokens_seen": 3022651392 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045241659538066727, + "loss": 2.7056, + "theoretical_loss": 3.318836765186817, + "tokens_seen": 3022716928 + }, + { + "epoch": 0.1, + "learning_rate": 0.00045230966638152263, + "loss": 2.5471, + "theoretical_loss": 3.3188312412606327, + "tokens_seen": 3022782464 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004522027373823781, + "loss": 2.6295, + "theoretical_loss": 3.3188257174877416, + "tokens_seen": 3022848000 + }, + { + "epoch": 0.1, + "learning_rate": 0.0004520958083832336, + "loss": 2.5878, + "theoretical_loss": 3.3188201938681368, + "tokens_seen": 3022913536 + }, + { + "epoch": 0.11, + "learning_rate": 0.00045198887938408894, + "loss": 2.7199, + "theoretical_loss": 3.3188146704018107, + "tokens_seen": 3022979072 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004518819503849444, + "loss": 2.441, + "theoretical_loss": 3.318809147088756, + "tokens_seen": 3023044608 + }, + { + "epoch": 0.11, + "learning_rate": 0.00045177502138579983, + "loss": 2.5828, + "theoretical_loss": 3.318803623928965, + "tokens_seen": 3023110144 + }, + { + "epoch": 0.11, + "objective/train/docs_used": 1658338, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4892239570617676, + "objective/train/theoretical_loss": 3.3187981009224297, + "objective/train/tokens_used": 53226976, + "theoretical_loss": 3.3187981009224297, + "tokens_seen": 3023175680 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004516680923866553, + "loss": 2.4785, + "theoretical_loss": 3.3187981009224297, + "tokens_seen": 3023175680 + }, + { + "epoch": 0.11, + "learning_rate": 0.00045156116338751067, + "loss": 2.4167, + "theoretical_loss": 3.318792578069143, + "tokens_seen": 3023241216 + }, + { + "epoch": 0.11, + "learning_rate": 0.00045145423438836614, + "loss": 2.4702, + "theoretical_loss": 3.3187870553690972, + "tokens_seen": 3023306752 + }, + { + "epoch": 0.11, + "learning_rate": 0.00045134730538922156, + "loss": 2.3209, + "theoretical_loss": 3.3187815328222845, + "tokens_seen": 3023372288 + }, + { + "epoch": 0.11, + "learning_rate": 0.000451240376390077, + "loss": 2.5124, + "theoretical_loss": 3.3187760104286976, + "tokens_seen": 3023437824 + }, + { + "epoch": 0.11, + "learning_rate": 0.00045113344739093245, + "loss": 2.5166, + "theoretical_loss": 3.3187704881883286, + "tokens_seen": 3023503360 + }, + { + "epoch": 0.11, + "learning_rate": 0.00045102651839178787, + "loss": 2.6266, + "theoretical_loss": 3.3187649661011704, + "tokens_seen": 3023568896 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004509195893926433, + "loss": 2.6106, + "theoretical_loss": 3.3187594441672155, + "tokens_seen": 3023634432 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004508126603934987, + "loss": 2.768, + "theoretical_loss": 3.3187539223864557, + "tokens_seen": 3023699968 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004507057313943542, + "loss": 2.5288, + "theoretical_loss": 3.3187484007588837, + "tokens_seen": 3023765504 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004505988023952096, + "loss": 2.4277, + "theoretical_loss": 3.318742879284492, + "tokens_seen": 3023831040 + }, + { + "epoch": 0.11, + "learning_rate": 0.000450491873396065, + "loss": 2.7191, + "theoretical_loss": 3.3187373579632733, + "tokens_seen": 3023896576 + }, + { + "epoch": 0.11, + "learning_rate": 0.00045038494439692044, + "loss": 2.5977, + "theoretical_loss": 3.3187318367952194, + "tokens_seen": 3023962112 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004502780153977759, + "loss": 2.5787, + "theoretical_loss": 3.3187263157803235, + "tokens_seen": 3024027648 + }, + { + "epoch": 0.11, + "learning_rate": 0.00045017108639863133, + "loss": 2.7877, + "theoretical_loss": 3.318720794918577, + "tokens_seen": 3024093184 + }, + { + "epoch": 0.11, + "learning_rate": 0.00045006415739948675, + "loss": 2.6564, + "theoretical_loss": 3.3187152742099735, + "tokens_seen": 3024158720 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004499572284003422, + "loss": 2.7045, + "theoretical_loss": 3.3187097536545047, + "tokens_seen": 3024224256 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004498502994011976, + "loss": 2.6133, + "theoretical_loss": 3.3187042332521632, + "tokens_seen": 3024289792 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044974337040205306, + "loss": 2.4414, + "theoretical_loss": 3.3186987130029415, + "tokens_seen": 3024355328 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004496364414029085, + "loss": 2.6137, + "theoretical_loss": 3.318693192906832, + "tokens_seen": 3024420864 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004495295124037639, + "loss": 2.5737, + "theoretical_loss": 3.3186876729638266, + "tokens_seen": 3024486400 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004494225834046193, + "loss": 2.6552, + "theoretical_loss": 3.318682153173919, + "tokens_seen": 3024551936 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004493156544054748, + "loss": 2.5312, + "theoretical_loss": 3.3186766335371005, + "tokens_seen": 3024617472 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044920872540633026, + "loss": 2.5598, + "theoretical_loss": 3.3186711140533642, + "tokens_seen": 3024683008 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004491017964071856, + "loss": 2.6287, + "theoretical_loss": 3.318665594722702, + "tokens_seen": 3024748544 + }, + { + "epoch": 0.11, + "objective/train/docs_used": 1659421, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7263076305389404, + "objective/train/theoretical_loss": 3.3186600755451066, + "objective/train/tokens_used": 54865376, + "theoretical_loss": 3.3186600755451066, + "tokens_seen": 3024814080 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004489948674080411, + "loss": 2.5863, + "theoretical_loss": 3.3186600755451066, + "tokens_seen": 3024814080 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004488879384088965, + "loss": 2.4847, + "theoretical_loss": 3.3186545565205705, + "tokens_seen": 3024879616 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044878100940975193, + "loss": 2.6246, + "theoretical_loss": 3.3186490376490863, + "tokens_seen": 3024945152 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044867408041060735, + "loss": 2.4603, + "theoretical_loss": 3.318643518930646, + "tokens_seen": 3025010688 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004485671514114628, + "loss": 2.5178, + "theoretical_loss": 3.318638000365242, + "tokens_seen": 3025076224 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004484602224123182, + "loss": 2.5561, + "theoretical_loss": 3.3186324819528674, + "tokens_seen": 3025141760 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044835329341317366, + "loss": 2.5152, + "theoretical_loss": 3.318626963693514, + "tokens_seen": 3025207296 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004482463644140291, + "loss": 2.4108, + "theoretical_loss": 3.318621445587175, + "tokens_seen": 3025272832 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004481394354148845, + "loss": 2.7335, + "theoretical_loss": 3.3186159276338416, + "tokens_seen": 3025338368 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044803250641573997, + "loss": 2.4303, + "theoretical_loss": 3.3186104098335076, + "tokens_seen": 3025403904 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004479255774165954, + "loss": 2.3138, + "theoretical_loss": 3.3186048921861646, + "tokens_seen": 3025469440 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044781864841745086, + "loss": 2.4013, + "theoretical_loss": 3.318599374691805, + "tokens_seen": 3025534976 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044771171941830623, + "loss": 2.3985, + "theoretical_loss": 3.3185938573504217, + "tokens_seen": 3025600512 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004476047904191617, + "loss": 2.6061, + "theoretical_loss": 3.318588340162007, + "tokens_seen": 3025666048 + }, + { + "epoch": 0.11, + "learning_rate": 0.0004474978614200171, + "loss": 2.659, + "theoretical_loss": 3.3185828231265533, + "tokens_seen": 3025731584 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044739093242087254, + "loss": 2.2818, + "theoretical_loss": 3.3185773062440527, + "tokens_seen": 3025797120 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044728400342172796, + "loss": 2.4729, + "theoretical_loss": 3.3185717895144986, + "tokens_seen": 3025862656 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044717707442258343, + "loss": 2.6619, + "theoretical_loss": 3.3185662729378826, + "tokens_seen": 3025928192 + }, + { + "epoch": 0.11, + "learning_rate": 0.00044707014542343885, + "loss": 2.5107, + "theoretical_loss": 3.3185607565141972, + "tokens_seen": 3025993728 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044696321642429427, + "loss": 2.3266, + "theoretical_loss": 3.318555240243435, + "tokens_seen": 3026059264 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044685628742514974, + "loss": 2.6211, + "theoretical_loss": 3.318549724125589, + "tokens_seen": 3026124800 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004467493584260051, + "loss": 2.3811, + "theoretical_loss": 3.3185442081606507, + "tokens_seen": 3026190336 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004466424294268606, + "loss": 2.5501, + "theoretical_loss": 3.318538692348613, + "tokens_seen": 3026255872 + }, + { + "epoch": 0.12, + "learning_rate": 0.000446535500427716, + "loss": 2.7668, + "theoretical_loss": 3.3185331766894683, + "tokens_seen": 3026321408 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044642857142857147, + "loss": 2.6323, + "theoretical_loss": 3.318527661183209, + "tokens_seen": 3026386944 + }, + { + "epoch": 0.12, + "objective/train/docs_used": 1660021, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.0193262100219727, + "objective/train/theoretical_loss": 3.318522145829828, + "objective/train/tokens_used": 56503776, + "theoretical_loss": 3.318522145829828, + "tokens_seen": 3026452480 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044632164242942683, + "loss": 2.8896, + "theoretical_loss": 3.318522145829828, + "tokens_seen": 3026452480 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004462147134302823, + "loss": 2.4917, + "theoretical_loss": 3.318516630629317, + "tokens_seen": 3026518016 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004461077844311378, + "loss": 2.3977, + "theoretical_loss": 3.318511115581669, + "tokens_seen": 3026583552 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044600085543199314, + "loss": 2.5538, + "theoretical_loss": 3.3185056006868763, + "tokens_seen": 3026649088 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004458939264328486, + "loss": 2.41, + "theoretical_loss": 3.3185000859449314, + "tokens_seen": 3026714624 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044578699743370403, + "loss": 2.4, + "theoretical_loss": 3.318494571355827, + "tokens_seen": 3026780160 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044568006843455945, + "loss": 2.7303, + "theoretical_loss": 3.3184890569195544, + "tokens_seen": 3026845696 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044557313943541487, + "loss": 2.4075, + "theoretical_loss": 3.3184835426361077, + "tokens_seen": 3026911232 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044546621043627034, + "loss": 2.4872, + "theoretical_loss": 3.3184780285054782, + "tokens_seen": 3026976768 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044535928143712576, + "loss": 2.4217, + "theoretical_loss": 3.318472514527659, + "tokens_seen": 3027042304 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004452523524379812, + "loss": 2.6315, + "theoretical_loss": 3.318467000702642, + "tokens_seen": 3027107840 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004451454234388366, + "loss": 2.553, + "theoretical_loss": 3.31846148703042, + "tokens_seen": 3027173376 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004450384944396921, + "loss": 2.3969, + "theoretical_loss": 3.3184559735109853, + "tokens_seen": 3027238912 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004449315654405475, + "loss": 2.4959, + "theoretical_loss": 3.3184504601443305, + "tokens_seen": 3027304448 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004448246364414029, + "loss": 2.7329, + "theoretical_loss": 3.3184449469304482, + "tokens_seen": 3027369984 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004447177074422584, + "loss": 2.6387, + "theoretical_loss": 3.318439433869331, + "tokens_seen": 3027435520 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044461077844311375, + "loss": 2.5593, + "theoretical_loss": 3.3184339209609703, + "tokens_seen": 3027501056 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004445038494439692, + "loss": 2.5025, + "theoretical_loss": 3.3184284082053597, + "tokens_seen": 3027566592 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044439692044482464, + "loss": 2.8454, + "theoretical_loss": 3.318422895602491, + "tokens_seen": 3027632128 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044428999144568006, + "loss": 2.5777, + "theoretical_loss": 3.318417383152357, + "tokens_seen": 3027697664 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004441830624465355, + "loss": 2.5114, + "theoretical_loss": 3.31841187085495, + "tokens_seen": 3027763200 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044407613344739095, + "loss": 2.4808, + "theoretical_loss": 3.3184063587102632, + "tokens_seen": 3027828736 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004439692044482464, + "loss": 2.6478, + "theoretical_loss": 3.318400846718288, + "tokens_seen": 3027894272 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004438622754491018, + "loss": 2.3825, + "theoretical_loss": 3.318395334879017, + "tokens_seen": 3027959808 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044375534644995726, + "loss": 2.4853, + "theoretical_loss": 3.318389823192443, + "tokens_seen": 3028025344 + }, + { + "epoch": 0.12, + "objective/train/docs_used": 1661280, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6410443782806396, + "objective/train/theoretical_loss": 3.3183843116585585, + "objective/train/tokens_used": 58142176, + "theoretical_loss": 3.3183843116585585, + "tokens_seen": 3028090880 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004436484174508127, + "loss": 2.469, + "theoretical_loss": 3.3183843116585585, + "tokens_seen": 3028090880 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004435414884516681, + "loss": 2.7035, + "theoretical_loss": 3.318378800277356, + "tokens_seen": 3028156416 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004434345594525235, + "loss": 2.5603, + "theoretical_loss": 3.318373289048828, + "tokens_seen": 3028221952 + }, + { + "epoch": 0.12, + "learning_rate": 0.000443327630453379, + "loss": 2.5599, + "theoretical_loss": 3.3183677779729663, + "tokens_seen": 3028287488 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044322070145423435, + "loss": 2.5114, + "theoretical_loss": 3.318362267049764, + "tokens_seen": 3028353024 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004431137724550898, + "loss": 2.6164, + "theoretical_loss": 3.3183567562792136, + "tokens_seen": 3028418560 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004430068434559453, + "loss": 2.6386, + "theoretical_loss": 3.318351245661307, + "tokens_seen": 3028484096 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044289991445680066, + "loss": 2.6347, + "theoretical_loss": 3.3183457351960377, + "tokens_seen": 3028549632 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044279298545765614, + "loss": 2.6369, + "theoretical_loss": 3.3183402248833973, + "tokens_seen": 3028615168 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044268605645851155, + "loss": 2.6041, + "theoretical_loss": 3.3183347147233784, + "tokens_seen": 3028680704 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044257912745936703, + "loss": 2.4263, + "theoretical_loss": 3.318329204715974, + "tokens_seen": 3028746240 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004424721984602224, + "loss": 2.5039, + "theoretical_loss": 3.3183236948611756, + "tokens_seen": 3028811776 + }, + { + "epoch": 0.12, + "learning_rate": 0.00044236526946107786, + "loss": 2.6112, + "theoretical_loss": 3.3183181851589763, + "tokens_seen": 3028877312 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004422583404619333, + "loss": 2.512, + "theoretical_loss": 3.3183126756093686, + "tokens_seen": 3028942848 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004421514114627887, + "loss": 2.4492, + "theoretical_loss": 3.318307166212345, + "tokens_seen": 3029008384 + }, + { + "epoch": 0.12, + "learning_rate": 0.0004420444824636442, + "loss": 2.7508, + "theoretical_loss": 3.318301656967898, + "tokens_seen": 3029073920 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004419375534644996, + "loss": 2.6316, + "theoretical_loss": 3.318296147876019, + "tokens_seen": 3029139456 + }, + { + "epoch": 0.13, + "learning_rate": 0.000441830624465355, + "loss": 2.6729, + "theoretical_loss": 3.3182906389367024, + "tokens_seen": 3029204992 + }, + { + "epoch": 0.13, + "learning_rate": 0.00044172369546621043, + "loss": 2.7944, + "theoretical_loss": 3.318285130149939, + "tokens_seen": 3029270528 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004416167664670659, + "loss": 2.557, + "theoretical_loss": 3.3182796215157224, + "tokens_seen": 3029336064 + }, + { + "epoch": 0.13, + "learning_rate": 0.00044150983746792127, + "loss": 2.5116, + "theoretical_loss": 3.3182741130340445, + "tokens_seen": 3029401600 + }, + { + "epoch": 0.13, + "learning_rate": 0.00044140290846877674, + "loss": 2.3178, + "theoretical_loss": 3.318268604704898, + "tokens_seen": 3029467136 + }, + { + "epoch": 0.13, + "learning_rate": 0.00044129597946963216, + "loss": 2.4742, + "theoretical_loss": 3.318263096528275, + "tokens_seen": 3029532672 + }, + { + "epoch": 0.13, + "learning_rate": 0.00044118905047048763, + "loss": 2.4291, + "theoretical_loss": 3.318257588504168, + "tokens_seen": 3029598208 + }, + { + "epoch": 0.13, + "learning_rate": 0.000441082121471343, + "loss": 2.3404, + "theoretical_loss": 3.31825208063257, + "tokens_seen": 3029663744 + }, + { + "epoch": 0.13, + "objective/train/docs_used": 1661975, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.466115951538086, + "objective/train/theoretical_loss": 3.318246572913474, + "objective/train/tokens_used": 59780576, + "theoretical_loss": 3.318246572913474, + "tokens_seen": 3029729280 + }, + { + "epoch": 0.13, + "learning_rate": 0.00044097519247219847, + "loss": 2.4917, + "theoretical_loss": 3.318246572913474, + "tokens_seen": 3029729280 + }, + { + "epoch": 0.13, + "learning_rate": 0.00044086826347305394, + "loss": 2.4753, + "theoretical_loss": 3.3182410653468706, + "tokens_seen": 3029794816 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004407613344739093, + "loss": 2.6818, + "theoretical_loss": 3.318235557932754, + "tokens_seen": 3029860352 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004406544054747648, + "loss": 2.5633, + "theoretical_loss": 3.3182300506711155, + "tokens_seen": 3029925888 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004405474764756202, + "loss": 2.3311, + "theoretical_loss": 3.318224543561948, + "tokens_seen": 3029991424 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004404405474764756, + "loss": 2.323, + "theoretical_loss": 3.318219036605245, + "tokens_seen": 3030056960 + }, + { + "epoch": 0.13, + "learning_rate": 0.00044033361847733104, + "loss": 2.4226, + "theoretical_loss": 3.3182135298009974, + "tokens_seen": 3030122496 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004402266894781865, + "loss": 2.2801, + "theoretical_loss": 3.3182080231491984, + "tokens_seen": 3030188032 + }, + { + "epoch": 0.13, + "learning_rate": 0.00044011976047904193, + "loss": 2.8268, + "theoretical_loss": 3.3182025166498406, + "tokens_seen": 3030253568 + }, + { + "epoch": 0.13, + "learning_rate": 0.00044001283147989735, + "loss": 2.6459, + "theoretical_loss": 3.3181970103029164, + "tokens_seen": 3030319104 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004399059024807528, + "loss": 2.4968, + "theoretical_loss": 3.3181915041084182, + "tokens_seen": 3030384640 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043979897348160824, + "loss": 2.5702, + "theoretical_loss": 3.3181859980663386, + "tokens_seen": 3030450176 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043969204448246366, + "loss": 2.4552, + "theoretical_loss": 3.3181804921766695, + "tokens_seen": 3030515712 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004395851154833191, + "loss": 2.3615, + "theoretical_loss": 3.3181749864394043, + "tokens_seen": 3030581248 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043947818648417455, + "loss": 2.5737, + "theoretical_loss": 3.318169480854535, + "tokens_seen": 3030646784 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004393712574850299, + "loss": 2.6029, + "theoretical_loss": 3.318163975422054, + "tokens_seen": 3030712320 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004392643284858854, + "loss": 2.4033, + "theoretical_loss": 3.318158470141954, + "tokens_seen": 3030777856 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004391573994867408, + "loss": 2.5225, + "theoretical_loss": 3.318152965014227, + "tokens_seen": 3030843392 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004390504704875962, + "loss": 2.4062, + "theoretical_loss": 3.3181474600388667, + "tokens_seen": 3030908928 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004389435414884517, + "loss": 2.5334, + "theoretical_loss": 3.3181419552158644, + "tokens_seen": 3030974464 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004388366124893071, + "loss": 2.2213, + "theoretical_loss": 3.318136450545213, + "tokens_seen": 3031040000 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004387296834901626, + "loss": 2.5278, + "theoretical_loss": 3.318130946026905, + "tokens_seen": 3031105536 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043862275449101795, + "loss": 2.5084, + "theoretical_loss": 3.318125441660933, + "tokens_seen": 3031171072 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004385158254918734, + "loss": 2.6303, + "theoretical_loss": 3.318119937447289, + "tokens_seen": 3031236608 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043840889649272884, + "loss": 2.5961, + "theoretical_loss": 3.318114433385966, + "tokens_seen": 3031302144 + }, + { + "epoch": 0.13, + "objective/train/docs_used": 1662642, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.3627991676330566, + "objective/train/theoretical_loss": 3.3181089294769563, + "objective/train/tokens_used": 61418976, + "theoretical_loss": 3.3181089294769563, + "tokens_seen": 3031367680 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043830196749358426, + "loss": 2.6984, + "theoretical_loss": 3.3181089294769563, + "tokens_seen": 3031367680 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004381950384944397, + "loss": 2.7248, + "theoretical_loss": 3.3181034257202526, + "tokens_seen": 3031433216 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043808810949529515, + "loss": 2.6676, + "theoretical_loss": 3.318097922115847, + "tokens_seen": 3031498752 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043798118049615057, + "loss": 2.4558, + "theoretical_loss": 3.318092418663732, + "tokens_seen": 3031564288 + }, + { + "epoch": 0.13, + "learning_rate": 0.000437874251497006, + "loss": 2.3336, + "theoretical_loss": 3.318086915363901, + "tokens_seen": 3031629824 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043776732249786146, + "loss": 2.5785, + "theoretical_loss": 3.3180814122163453, + "tokens_seen": 3031695360 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004376603934987168, + "loss": 2.3364, + "theoretical_loss": 3.318075909221058, + "tokens_seen": 3031760896 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004375534644995723, + "loss": 2.587, + "theoretical_loss": 3.3180704063780313, + "tokens_seen": 3031826432 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004374465355004277, + "loss": 2.4558, + "theoretical_loss": 3.3180649036872585, + "tokens_seen": 3031891968 + }, + { + "epoch": 0.13, + "learning_rate": 0.0004373396065012832, + "loss": 2.3549, + "theoretical_loss": 3.318059401148731, + "tokens_seen": 3031957504 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043723267750213856, + "loss": 2.5263, + "theoretical_loss": 3.3180538987624417, + "tokens_seen": 3032023040 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043712574850299403, + "loss": 2.5736, + "theoretical_loss": 3.3180483965283836, + "tokens_seen": 3032088576 + }, + { + "epoch": 0.13, + "learning_rate": 0.00043701881950384945, + "loss": 2.7003, + "theoretical_loss": 3.3180428944465485, + "tokens_seen": 3032154112 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043691189050470487, + "loss": 2.654, + "theoretical_loss": 3.318037392516929, + "tokens_seen": 3032219648 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043680496150556034, + "loss": 2.6114, + "theoretical_loss": 3.318031890739518, + "tokens_seen": 3032285184 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043669803250641576, + "loss": 2.5595, + "theoretical_loss": 3.318026389114308, + "tokens_seen": 3032350720 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004365911035072712, + "loss": 2.7492, + "theoretical_loss": 3.318020887641291, + "tokens_seen": 3032416256 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004364841745081266, + "loss": 2.5468, + "theoretical_loss": 3.3180153863204596, + "tokens_seen": 3032481792 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043637724550898207, + "loss": 2.7061, + "theoretical_loss": 3.318009885151807, + "tokens_seen": 3032547328 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043627031650983743, + "loss": 2.5121, + "theoretical_loss": 3.3180043841353246, + "tokens_seen": 3032612864 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004361633875106929, + "loss": 2.5887, + "theoretical_loss": 3.317998883271006, + "tokens_seen": 3032678400 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004360564585115483, + "loss": 2.5448, + "theoretical_loss": 3.317993382558843, + "tokens_seen": 3032743936 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004359495295124038, + "loss": 2.506, + "theoretical_loss": 3.317987881998828, + "tokens_seen": 3032809472 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004358426005132592, + "loss": 2.5888, + "theoretical_loss": 3.317982381590954, + "tokens_seen": 3032875008 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043573567151411463, + "loss": 2.6217, + "theoretical_loss": 3.3179768813352135, + "tokens_seen": 3032940544 + }, + { + "epoch": 0.14, + "objective/train/docs_used": 1663221, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.339816093444824, + "objective/train/theoretical_loss": 3.3179713812315983, + "objective/train/tokens_used": 63057376, + "theoretical_loss": 3.3179713812315983, + "tokens_seen": 3033006080 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004356287425149701, + "loss": 2.6192, + "theoretical_loss": 3.3179713812315983, + "tokens_seen": 3033006080 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043552181351582547, + "loss": 2.4596, + "theoretical_loss": 3.317965881280102, + "tokens_seen": 3033071616 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043541488451668094, + "loss": 2.3786, + "theoretical_loss": 3.317960381480716, + "tokens_seen": 3033137152 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043530795551753636, + "loss": 2.3844, + "theoretical_loss": 3.3179548818334337, + "tokens_seen": 3033202688 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004352010265183918, + "loss": 2.6781, + "theoretical_loss": 3.317949382338247, + "tokens_seen": 3033268224 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004350940975192472, + "loss": 2.6249, + "theoretical_loss": 3.3179438829951486, + "tokens_seen": 3033333760 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043498716852010267, + "loss": 2.7442, + "theoretical_loss": 3.3179383838041314, + "tokens_seen": 3033399296 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004348802395209581, + "loss": 2.5186, + "theoretical_loss": 3.317932884765187, + "tokens_seen": 3033464832 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004347733105218135, + "loss": 2.7132, + "theoretical_loss": 3.317927385878309, + "tokens_seen": 3033530368 + }, + { + "epoch": 0.14, + "learning_rate": 0.000434666381522669, + "loss": 2.6596, + "theoretical_loss": 3.317921887143489, + "tokens_seen": 3033595904 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004345594525235244, + "loss": 2.5732, + "theoretical_loss": 3.31791638856072, + "tokens_seen": 3033661440 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004344525235243798, + "loss": 2.6813, + "theoretical_loss": 3.3179108901299945, + "tokens_seen": 3033726976 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043434559452523524, + "loss": 2.6586, + "theoretical_loss": 3.317905391851305, + "tokens_seen": 3033792512 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004342386655260907, + "loss": 2.4702, + "theoretical_loss": 3.317899893724644, + "tokens_seen": 3033858048 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004341317365269461, + "loss": 2.6286, + "theoretical_loss": 3.3178943957500033, + "tokens_seen": 3033923584 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043402480752780155, + "loss": 2.6536, + "theoretical_loss": 3.3178888979273764, + "tokens_seen": 3033989120 + }, + { + "epoch": 0.14, + "learning_rate": 0.000433917878528657, + "loss": 2.4538, + "theoretical_loss": 3.317883400256756, + "tokens_seen": 3034054656 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004338109495295124, + "loss": 2.4945, + "theoretical_loss": 3.3178779027381333, + "tokens_seen": 3034120192 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043370402053036786, + "loss": 2.5167, + "theoretical_loss": 3.3178724053715016, + "tokens_seen": 3034185728 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004335970915312233, + "loss": 2.7934, + "theoretical_loss": 3.3178669081568537, + "tokens_seen": 3034251264 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043349016253207875, + "loss": 2.3483, + "theoretical_loss": 3.3178614110941815, + "tokens_seen": 3034316800 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004333832335329341, + "loss": 2.384, + "theoretical_loss": 3.317855914183478, + "tokens_seen": 3034382336 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004332763045337896, + "loss": 2.6092, + "theoretical_loss": 3.3178504174247356, + "tokens_seen": 3034447872 + }, + { + "epoch": 0.14, + "learning_rate": 0.000433169375534645, + "loss": 2.8584, + "theoretical_loss": 3.317844920817947, + "tokens_seen": 3034513408 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004330624465355004, + "loss": 2.6498, + "theoretical_loss": 3.317839424363104, + "tokens_seen": 3034578944 + }, + { + "epoch": 0.14, + "objective/train/docs_used": 1664335, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.460336923599243, + "objective/train/theoretical_loss": 3.3178339280602, + "objective/train/tokens_used": 64695776, + "theoretical_loss": 3.3178339280602, + "tokens_seen": 3034644480 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043295551753635584, + "loss": 2.4474, + "theoretical_loss": 3.3178339280602, + "tokens_seen": 3034644480 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004328485885372113, + "loss": 2.5976, + "theoretical_loss": 3.317828431909227, + "tokens_seen": 3034710016 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043274165953806673, + "loss": 2.2452, + "theoretical_loss": 3.3178229359101774, + "tokens_seen": 3034775552 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043263473053892215, + "loss": 2.5553, + "theoretical_loss": 3.3178174400630445, + "tokens_seen": 3034841088 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004325278015397776, + "loss": 2.4949, + "theoretical_loss": 3.31781194436782, + "tokens_seen": 3034906624 + }, + { + "epoch": 0.14, + "learning_rate": 0.000432420872540633, + "loss": 2.7255, + "theoretical_loss": 3.3178064488244967, + "tokens_seen": 3034972160 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043231394354148846, + "loss": 2.4908, + "theoretical_loss": 3.317800953433067, + "tokens_seen": 3035037696 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004322070145423439, + "loss": 2.7668, + "theoretical_loss": 3.3177954581935234, + "tokens_seen": 3035103232 + }, + { + "epoch": 0.14, + "learning_rate": 0.00043210008554319935, + "loss": 2.6495, + "theoretical_loss": 3.317789963105859, + "tokens_seen": 3035168768 + }, + { + "epoch": 0.14, + "learning_rate": 0.0004319931565440547, + "loss": 2.6374, + "theoretical_loss": 3.317784468170066, + "tokens_seen": 3035234304 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004318862275449102, + "loss": 2.5718, + "theoretical_loss": 3.3177789733861363, + "tokens_seen": 3035299840 + }, + { + "epoch": 0.15, + "learning_rate": 0.00043177929854576567, + "loss": 2.6787, + "theoretical_loss": 3.317773478754063, + "tokens_seen": 3035365376 + }, + { + "epoch": 0.15, + "learning_rate": 0.00043167236954662103, + "loss": 2.6101, + "theoretical_loss": 3.3177679842738383, + "tokens_seen": 3035430912 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004315654405474765, + "loss": 2.7885, + "theoretical_loss": 3.317762489945456, + "tokens_seen": 3035496448 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004314585115483319, + "loss": 2.4024, + "theoretical_loss": 3.3177569957689066, + "tokens_seen": 3035561984 + }, + { + "epoch": 0.15, + "learning_rate": 0.00043135158254918734, + "loss": 2.5602, + "theoretical_loss": 3.3177515017441843, + "tokens_seen": 3035627520 + }, + { + "epoch": 0.15, + "learning_rate": 0.00043124465355004276, + "loss": 2.7748, + "theoretical_loss": 3.3177460078712806, + "tokens_seen": 3035693056 + }, + { + "epoch": 0.15, + "learning_rate": 0.00043113772455089823, + "loss": 2.5992, + "theoretical_loss": 3.3177405141501883, + "tokens_seen": 3035758592 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004310307955517536, + "loss": 2.6069, + "theoretical_loss": 3.3177350205809004, + "tokens_seen": 3035824128 + }, + { + "epoch": 0.15, + "learning_rate": 0.00043092386655260907, + "loss": 2.5898, + "theoretical_loss": 3.317729527163409, + "tokens_seen": 3035889664 + }, + { + "epoch": 0.15, + "learning_rate": 0.00043081693755346454, + "loss": 2.7153, + "theoretical_loss": 3.317724033897706, + "tokens_seen": 3035955200 + }, + { + "epoch": 0.15, + "learning_rate": 0.00043071000855431996, + "loss": 2.6276, + "theoretical_loss": 3.3177185407837855, + "tokens_seen": 3036020736 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004306030795551754, + "loss": 2.5747, + "theoretical_loss": 3.3177130478216386, + "tokens_seen": 3036086272 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004304961505560308, + "loss": 2.921, + "theoretical_loss": 3.3177075550112587, + "tokens_seen": 3036151808 + }, + { + "epoch": 0.15, + "learning_rate": 0.00043038922155688627, + "loss": 2.4716, + "theoretical_loss": 3.3177020623526374, + "tokens_seen": 3036217344 + }, + { + "epoch": 0.15, + "objective/train/docs_used": 1665364, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.702648878097534, + "objective/train/theoretical_loss": 3.3176965698457686, + "objective/train/tokens_used": 66334176, + "theoretical_loss": 3.3176965698457686, + "tokens_seen": 3036282880 + }, + { + "epoch": 0.15, + "learning_rate": 0.00043028229255774163, + "loss": 2.8016, + "theoretical_loss": 3.3176965698457686, + "tokens_seen": 3036282880 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004301753635585971, + "loss": 2.6286, + "theoretical_loss": 3.3176910774906436, + "tokens_seen": 3036348416 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004300684345594525, + "loss": 2.5836, + "theoretical_loss": 3.3176855852872555, + "tokens_seen": 3036413952 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042996150556030794, + "loss": 2.7408, + "theoretical_loss": 3.3176800932355968, + "tokens_seen": 3036479488 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004298545765611634, + "loss": 2.5826, + "theoretical_loss": 3.3176746013356597, + "tokens_seen": 3036545024 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042974764756201884, + "loss": 2.258, + "theoretical_loss": 3.3176691095874373, + "tokens_seen": 3036610560 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042964071856287425, + "loss": 2.5244, + "theoretical_loss": 3.317663617990922, + "tokens_seen": 3036676096 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004295337895637297, + "loss": 2.4518, + "theoretical_loss": 3.317658126546106, + "tokens_seen": 3036741632 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042942686056458515, + "loss": 2.6188, + "theoretical_loss": 3.3176526352529816, + "tokens_seen": 3036807168 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042931993156544056, + "loss": 2.5812, + "theoretical_loss": 3.3176471441115423, + "tokens_seen": 3036872704 + }, + { + "epoch": 0.15, + "learning_rate": 0.000429213002566296, + "loss": 2.5719, + "theoretical_loss": 3.31764165312178, + "tokens_seen": 3036938240 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004291060735671514, + "loss": 2.6047, + "theoretical_loss": 3.317636162283687, + "tokens_seen": 3037003776 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004289991445680069, + "loss": 2.4401, + "theoretical_loss": 3.3176306715972563, + "tokens_seen": 3037069312 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042889221556886224, + "loss": 2.5756, + "theoretical_loss": 3.31762518106248, + "tokens_seen": 3037134848 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004287852865697177, + "loss": 2.5878, + "theoretical_loss": 3.3176196906793516, + "tokens_seen": 3037200384 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004286783575705732, + "loss": 2.5203, + "theoretical_loss": 3.317614200447862, + "tokens_seen": 3037265920 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042857142857142855, + "loss": 2.4597, + "theoretical_loss": 3.3176087103680056, + "tokens_seen": 3037331456 + }, + { + "epoch": 0.15, + "learning_rate": 0.000428464499572284, + "loss": 2.4774, + "theoretical_loss": 3.3176032204397736, + "tokens_seen": 3037396992 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042835757057313944, + "loss": 2.6599, + "theoretical_loss": 3.3175977306631594, + "tokens_seen": 3037462528 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004282506415739949, + "loss": 2.6216, + "theoretical_loss": 3.3175922410381546, + "tokens_seen": 3037528064 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004281437125748503, + "loss": 2.6732, + "theoretical_loss": 3.3175867515647526, + "tokens_seen": 3037593600 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042803678357570575, + "loss": 2.8955, + "theoretical_loss": 3.3175812622429453, + "tokens_seen": 3037659136 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042792985457656117, + "loss": 2.6119, + "theoretical_loss": 3.317575773072726, + "tokens_seen": 3037724672 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004278229255774166, + "loss": 2.4313, + "theoretical_loss": 3.317570284054087, + "tokens_seen": 3037790208 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042771599657827206, + "loss": 2.4777, + "theoretical_loss": 3.3175647951870197, + "tokens_seen": 3037855744 + }, + { + "epoch": 0.15, + "objective/train/docs_used": 1665791, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.440596342086792, + "objective/train/theoretical_loss": 3.317559306471518, + "objective/train/tokens_used": 67972576, + "theoretical_loss": 3.317559306471518, + "tokens_seen": 3037921280 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004276090675791275, + "loss": 2.4501, + "theoretical_loss": 3.317559306471518, + "tokens_seen": 3037921280 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004275021385799829, + "loss": 2.3919, + "theoretical_loss": 3.3175538179075743, + "tokens_seen": 3037986816 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004273952095808383, + "loss": 2.683, + "theoretical_loss": 3.317548329495181, + "tokens_seen": 3038052352 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004272882805816938, + "loss": 2.6477, + "theoretical_loss": 3.31754284123433, + "tokens_seen": 3038117888 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042718135158254915, + "loss": 2.5088, + "theoretical_loss": 3.3175373531250147, + "tokens_seen": 3038183424 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042707442258340463, + "loss": 2.6919, + "theoretical_loss": 3.3175318651672274, + "tokens_seen": 3038248960 + }, + { + "epoch": 0.15, + "learning_rate": 0.00042696749358426005, + "loss": 2.3505, + "theoretical_loss": 3.3175263773609602, + "tokens_seen": 3038314496 + }, + { + "epoch": 0.15, + "learning_rate": 0.0004268605645851155, + "loss": 2.4019, + "theoretical_loss": 3.3175208897062065, + "tokens_seen": 3038380032 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042675363558597094, + "loss": 2.5938, + "theoretical_loss": 3.3175154022029583, + "tokens_seen": 3038445568 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042664670658682636, + "loss": 2.7483, + "theoretical_loss": 3.317509914851208, + "tokens_seen": 3038511104 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042653977758768183, + "loss": 2.5649, + "theoretical_loss": 3.317504427650948, + "tokens_seen": 3038576640 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004264328485885372, + "loss": 2.7034, + "theoretical_loss": 3.3174989406021718, + "tokens_seen": 3038642176 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042632591958939267, + "loss": 2.5488, + "theoretical_loss": 3.3174934537048713, + "tokens_seen": 3038707712 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004262189905902481, + "loss": 2.5863, + "theoretical_loss": 3.317487966959039, + "tokens_seen": 3038773248 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004261120615911035, + "loss": 2.678, + "theoretical_loss": 3.317482480364667, + "tokens_seen": 3038838784 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004260051325919589, + "loss": 2.4831, + "theoretical_loss": 3.3174769939217494, + "tokens_seen": 3038904320 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004258982035928144, + "loss": 2.5811, + "theoretical_loss": 3.3174715076302776, + "tokens_seen": 3038969856 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042579127459366976, + "loss": 2.5055, + "theoretical_loss": 3.317466021490244, + "tokens_seen": 3039035392 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042568434559452523, + "loss": 2.7085, + "theoretical_loss": 3.3174605355016418, + "tokens_seen": 3039100928 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004255774165953807, + "loss": 2.2972, + "theoretical_loss": 3.3174550496644626, + "tokens_seen": 3039166464 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004254704875962361, + "loss": 2.7515, + "theoretical_loss": 3.3174495639787005, + "tokens_seen": 3039232000 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042536355859709154, + "loss": 2.449, + "theoretical_loss": 3.317444078444346, + "tokens_seen": 3039297536 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042525662959794696, + "loss": 2.7037, + "theoretical_loss": 3.317438593061394, + "tokens_seen": 3039363072 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042514970059880243, + "loss": 2.6908, + "theoretical_loss": 3.317433107829835, + "tokens_seen": 3039428608 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004250427715996578, + "loss": 2.461, + "theoretical_loss": 3.3174276227496633, + "tokens_seen": 3039494144 + }, + { + "epoch": 0.16, + "objective/train/docs_used": 1666356, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.657942295074463, + "objective/train/theoretical_loss": 3.31742213782087, + "objective/train/tokens_used": 69610976, + "theoretical_loss": 3.31742213782087, + "tokens_seen": 3039559680 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042493584260051327, + "loss": 2.6056, + "theoretical_loss": 3.31742213782087, + "tokens_seen": 3039559680 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004248289136013687, + "loss": 2.5217, + "theoretical_loss": 3.3174166530434483, + "tokens_seen": 3039625216 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004247219846022241, + "loss": 2.6368, + "theoretical_loss": 3.3174111684173906, + "tokens_seen": 3039690752 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004246150556030796, + "loss": 2.4843, + "theoretical_loss": 3.3174056839426895, + "tokens_seen": 3039756288 + }, + { + "epoch": 0.16, + "learning_rate": 0.000424508126603935, + "loss": 2.671, + "theoretical_loss": 3.317400199619338, + "tokens_seen": 3039821824 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004244011976047904, + "loss": 2.4733, + "theoretical_loss": 3.3173947154473282, + "tokens_seen": 3039887360 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042429426860564584, + "loss": 2.688, + "theoretical_loss": 3.3173892314266524, + "tokens_seen": 3039952896 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004241873396065013, + "loss": 2.6233, + "theoretical_loss": 3.3173837475573036, + "tokens_seen": 3040018432 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042408041060735673, + "loss": 2.6475, + "theoretical_loss": 3.3173782638392746, + "tokens_seen": 3040083968 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042397348160821215, + "loss": 2.635, + "theoretical_loss": 3.3173727802725574, + "tokens_seen": 3040149504 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042386655260906757, + "loss": 2.5467, + "theoretical_loss": 3.3173672968571446, + "tokens_seen": 3040215040 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042375962360992304, + "loss": 2.5554, + "theoretical_loss": 3.3173618135930294, + "tokens_seen": 3040280576 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042365269461077846, + "loss": 2.5479, + "theoretical_loss": 3.3173563304802034, + "tokens_seen": 3040346112 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004235457656116339, + "loss": 2.4553, + "theoretical_loss": 3.31735084751866, + "tokens_seen": 3040411648 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042343883661248935, + "loss": 2.6328, + "theoretical_loss": 3.3173453647083915, + "tokens_seen": 3040477184 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004233319076133447, + "loss": 2.6439, + "theoretical_loss": 3.31733988204939, + "tokens_seen": 3040542720 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004232249786142002, + "loss": 2.7004, + "theoretical_loss": 3.317334399541649, + "tokens_seen": 3040608256 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004231180496150556, + "loss": 2.3918, + "theoretical_loss": 3.31732891718516, + "tokens_seen": 3040673792 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004230111206159111, + "loss": 2.7061, + "theoretical_loss": 3.3173234349799166, + "tokens_seen": 3040739328 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042290419161676644, + "loss": 2.5726, + "theoretical_loss": 3.3173179529259107, + "tokens_seen": 3040804864 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004227972626176219, + "loss": 2.3567, + "theoretical_loss": 3.317312471023135, + "tokens_seen": 3040870400 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004226903336184774, + "loss": 2.4497, + "theoretical_loss": 3.3173069892715823, + "tokens_seen": 3040935936 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042258340461933275, + "loss": 2.5488, + "theoretical_loss": 3.3173015076712447, + "tokens_seen": 3041001472 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004224764756201882, + "loss": 2.5573, + "theoretical_loss": 3.3172960262221154, + "tokens_seen": 3041067008 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042236954662104364, + "loss": 2.5725, + "theoretical_loss": 3.3172905449241865, + "tokens_seen": 3041132544 + }, + { + "epoch": 0.16, + "objective/train/docs_used": 1667402, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.76525616645813, + "objective/train/theoretical_loss": 3.317285063777451, + "objective/train/tokens_used": 71249376, + "theoretical_loss": 3.317285063777451, + "tokens_seen": 3041198080 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042226261762189906, + "loss": 2.7626, + "theoretical_loss": 3.317285063777451, + "tokens_seen": 3041198080 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004221556886227545, + "loss": 2.5848, + "theoretical_loss": 3.3172795827819, + "tokens_seen": 3041263616 + }, + { + "epoch": 0.16, + "learning_rate": 0.00042204875962360995, + "loss": 2.6828, + "theoretical_loss": 3.3172741019375285, + "tokens_seen": 3041329152 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004219418306244653, + "loss": 2.4074, + "theoretical_loss": 3.3172686212443274, + "tokens_seen": 3041394688 + }, + { + "epoch": 0.16, + "learning_rate": 0.0004218349016253208, + "loss": 2.8892, + "theoretical_loss": 3.3172631407022894, + "tokens_seen": 3041460224 + }, + { + "epoch": 0.17, + "learning_rate": 0.00042172797262617626, + "loss": 2.7005, + "theoretical_loss": 3.317257660311408, + "tokens_seen": 3041525760 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004216210436270317, + "loss": 2.559, + "theoretical_loss": 3.3172521800716748, + "tokens_seen": 3041591296 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004215141146278871, + "loss": 2.6013, + "theoretical_loss": 3.3172466999830825, + "tokens_seen": 3041656832 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004214071856287425, + "loss": 2.6889, + "theoretical_loss": 3.317241220045624, + "tokens_seen": 3041722368 + }, + { + "epoch": 0.17, + "learning_rate": 0.000421300256629598, + "loss": 2.7481, + "theoretical_loss": 3.317235740259292, + "tokens_seen": 3041787904 + }, + { + "epoch": 0.17, + "learning_rate": 0.00042119332763045336, + "loss": 2.8443, + "theoretical_loss": 3.3172302606240787, + "tokens_seen": 3041853440 + }, + { + "epoch": 0.17, + "learning_rate": 0.00042108639863130883, + "loss": 2.608, + "theoretical_loss": 3.3172247811399767, + "tokens_seen": 3041918976 + }, + { + "epoch": 0.17, + "learning_rate": 0.00042097946963216425, + "loss": 2.6413, + "theoretical_loss": 3.3172193018069787, + "tokens_seen": 3041984512 + }, + { + "epoch": 0.17, + "learning_rate": 0.00042087254063301967, + "loss": 2.7409, + "theoretical_loss": 3.317213822625077, + "tokens_seen": 3042050048 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004207656116338751, + "loss": 2.5192, + "theoretical_loss": 3.317208343594265, + "tokens_seen": 3042115584 + }, + { + "epoch": 0.17, + "learning_rate": 0.00042065868263473056, + "loss": 2.5256, + "theoretical_loss": 3.3172028647145346, + "tokens_seen": 3042181120 + }, + { + "epoch": 0.17, + "learning_rate": 0.000420551753635586, + "loss": 2.9425, + "theoretical_loss": 3.3171973859858785, + "tokens_seen": 3042246656 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004204448246364414, + "loss": 2.6551, + "theoretical_loss": 3.317191907408289, + "tokens_seen": 3042312192 + }, + { + "epoch": 0.17, + "learning_rate": 0.00042033789563729687, + "loss": 2.5016, + "theoretical_loss": 3.3171864289817594, + "tokens_seen": 3042377728 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004202309666381523, + "loss": 2.4924, + "theoretical_loss": 3.3171809507062817, + "tokens_seen": 3042443264 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004201240376390077, + "loss": 2.4651, + "theoretical_loss": 3.3171754725818485, + "tokens_seen": 3042508800 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004200171086398631, + "loss": 2.6467, + "theoretical_loss": 3.3171699946084523, + "tokens_seen": 3042574336 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004199101796407186, + "loss": 2.7134, + "theoretical_loss": 3.3171645167860864, + "tokens_seen": 3042639872 + }, + { + "epoch": 0.17, + "learning_rate": 0.00041980325064157396, + "loss": 2.5778, + "theoretical_loss": 3.3171590391147427, + "tokens_seen": 3042705408 + }, + { + "epoch": 0.17, + "learning_rate": 0.00041969632164242943, + "loss": 2.6097, + "theoretical_loss": 3.317153561594414, + "tokens_seen": 3042770944 + }, + { + "epoch": 0.17, + "objective/train/docs_used": 1668521, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.784074306488037, + "objective/train/theoretical_loss": 3.3171480842250927, + "objective/train/tokens_used": 72887776, + "theoretical_loss": 3.3171480842250927, + "tokens_seen": 3042836480 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004195893926432849, + "loss": 2.4201, + "theoretical_loss": 3.3171480842250927, + "tokens_seen": 3042836480 + }, + { + "epoch": 0.17, + "learning_rate": 0.00041948246364414027, + "loss": 2.4407, + "theoretical_loss": 3.3171426070067715, + "tokens_seen": 3042902016 + }, + { + "epoch": 0.17, + "learning_rate": 0.00041937553464499574, + "loss": 2.4949, + "theoretical_loss": 3.317137129939443, + "tokens_seen": 3042967552 + }, + { + "epoch": 0.17, + "learning_rate": 0.00041926860564585116, + "loss": 2.5337, + "theoretical_loss": 3.3171316530231003, + "tokens_seen": 3043033088 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004191616766467066, + "loss": 2.7346, + "theoretical_loss": 3.3171261762577346, + "tokens_seen": 3043098624 + }, + { + "epoch": 0.17, + "learning_rate": 0.000419054747647562, + "loss": 2.6234, + "theoretical_loss": 3.31712069964334, + "tokens_seen": 3043164160 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004189478186484175, + "loss": 2.7589, + "theoretical_loss": 3.3171152231799086, + "tokens_seen": 3043229696 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004188408896492729, + "loss": 2.5401, + "theoretical_loss": 3.317109746867432, + "tokens_seen": 3043295232 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004187339606501283, + "loss": 2.59, + "theoretical_loss": 3.317104270705905, + "tokens_seen": 3043360768 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004186270316509838, + "loss": 2.7252, + "theoretical_loss": 3.3170987946953177, + "tokens_seen": 3043426304 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004185201026518392, + "loss": 2.6156, + "theoretical_loss": 3.3170933188356644, + "tokens_seen": 3043491840 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004184131736526946, + "loss": 2.7091, + "theoretical_loss": 3.3170878431269366, + "tokens_seen": 3043557376 + }, + { + "epoch": 0.17, + "learning_rate": 0.00041830624465355004, + "loss": 2.6037, + "theoretical_loss": 3.3170823675691277, + "tokens_seen": 3043622912 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004181993156544055, + "loss": 2.4912, + "theoretical_loss": 3.31707689216223, + "tokens_seen": 3043688448 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004180923866552609, + "loss": 2.6153, + "theoretical_loss": 3.317071416906236, + "tokens_seen": 3043753984 + }, + { + "epoch": 0.17, + "learning_rate": 0.00041798545765611635, + "loss": 2.6945, + "theoretical_loss": 3.317065941801139, + "tokens_seen": 3043819520 + }, + { + "epoch": 0.17, + "learning_rate": 0.00041787852865697177, + "loss": 2.6036, + "theoretical_loss": 3.3170604668469297, + "tokens_seen": 3043885056 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004177715996578272, + "loss": 2.6957, + "theoretical_loss": 3.317054992043603, + "tokens_seen": 3043950592 + }, + { + "epoch": 0.17, + "learning_rate": 0.00041766467065868266, + "loss": 2.6709, + "theoretical_loss": 3.31704951739115, + "tokens_seen": 3044016128 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004175577416595381, + "loss": 2.7614, + "theoretical_loss": 3.317044042889564, + "tokens_seen": 3044081664 + }, + { + "epoch": 0.17, + "learning_rate": 0.00041745081266039355, + "loss": 2.4733, + "theoretical_loss": 3.317038568538837, + "tokens_seen": 3044147200 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004173438836612489, + "loss": 2.5863, + "theoretical_loss": 3.317033094338962, + "tokens_seen": 3044212736 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004172369546621044, + "loss": 2.565, + "theoretical_loss": 3.317027620289932, + "tokens_seen": 3044278272 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004171300256629598, + "loss": 2.7164, + "theoretical_loss": 3.3170221463917384, + "tokens_seen": 3044343808 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004170230966638152, + "loss": 2.6085, + "theoretical_loss": 3.317016672644375, + "tokens_seen": 3044409344 + }, + { + "epoch": 0.17, + "objective/train/docs_used": 1668971, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.9621496200561523, + "objective/train/theoretical_loss": 3.3170111990478337, + "objective/train/tokens_used": 74526176, + "theoretical_loss": 3.3170111990478337, + "tokens_seen": 3044474880 + }, + { + "epoch": 0.17, + "learning_rate": 0.00041691616766467064, + "loss": 2.9867, + "theoretical_loss": 3.3170111990478337, + "tokens_seen": 3044474880 + }, + { + "epoch": 0.17, + "learning_rate": 0.0004168092386655261, + "loss": 2.6062, + "theoretical_loss": 3.3170057256021077, + "tokens_seen": 3044540416 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004167023096663815, + "loss": 2.5778, + "theoretical_loss": 3.3170002523071886, + "tokens_seen": 3044605952 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041659538066723695, + "loss": 2.5364, + "theoretical_loss": 3.3169947791630703, + "tokens_seen": 3044671488 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041648845166809243, + "loss": 2.4894, + "theoretical_loss": 3.316989306169744, + "tokens_seen": 3044737024 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041638152266894785, + "loss": 2.7874, + "theoretical_loss": 3.3169838333272037, + "tokens_seen": 3044802560 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041627459366980326, + "loss": 2.6971, + "theoretical_loss": 3.3169783606354413, + "tokens_seen": 3044868096 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004161676646706587, + "loss": 2.7493, + "theoretical_loss": 3.316972888094449, + "tokens_seen": 3044933632 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041606073567151416, + "loss": 2.6092, + "theoretical_loss": 3.3169674157042195, + "tokens_seen": 3044999168 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004159538066723695, + "loss": 2.4757, + "theoretical_loss": 3.3169619434647464, + "tokens_seen": 3045064704 + }, + { + "epoch": 0.18, + "learning_rate": 0.000415846877673225, + "loss": 2.7305, + "theoretical_loss": 3.3169564713760216, + "tokens_seen": 3045130240 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004157399486740804, + "loss": 2.5703, + "theoretical_loss": 3.3169509994380375, + "tokens_seen": 3045195776 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041563301967493583, + "loss": 2.5121, + "theoretical_loss": 3.3169455276507867, + "tokens_seen": 3045261312 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004155260906757913, + "loss": 2.6273, + "theoretical_loss": 3.3169400560142623, + "tokens_seen": 3045326848 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004154191616766467, + "loss": 2.6654, + "theoretical_loss": 3.3169345845284566, + "tokens_seen": 3045392384 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041531223267750214, + "loss": 2.5341, + "theoretical_loss": 3.3169291131933623, + "tokens_seen": 3045457920 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041520530367835756, + "loss": 2.5806, + "theoretical_loss": 3.316923642008972, + "tokens_seen": 3045523456 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041509837467921303, + "loss": 2.598, + "theoretical_loss": 3.316918170975278, + "tokens_seen": 3045588992 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041499144568006845, + "loss": 2.6477, + "theoretical_loss": 3.3169127000922733, + "tokens_seen": 3045654528 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041488451668092387, + "loss": 2.6531, + "theoretical_loss": 3.31690722935995, + "tokens_seen": 3045720064 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004147775876817793, + "loss": 2.4507, + "theoretical_loss": 3.316901758778302, + "tokens_seen": 3045785600 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041467065868263476, + "loss": 2.5672, + "theoretical_loss": 3.3168962883473205, + "tokens_seen": 3045851136 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004145637296834902, + "loss": 2.5622, + "theoretical_loss": 3.3168908180669985, + "tokens_seen": 3045916672 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004144568006843456, + "loss": 2.7632, + "theoretical_loss": 3.316885347937329, + "tokens_seen": 3045982208 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041434987168520107, + "loss": 2.7035, + "theoretical_loss": 3.3168798779583035, + "tokens_seen": 3046047744 + }, + { + "epoch": 0.18, + "objective/train/docs_used": 1670023, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6770851612091064, + "objective/train/theoretical_loss": 3.316874408129916, + "objective/train/tokens_used": 76164576, + "theoretical_loss": 3.316874408129916, + "tokens_seen": 3046113280 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041424294268605644, + "loss": 2.6328, + "theoretical_loss": 3.316874408129916, + "tokens_seen": 3046113280 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004141360136869119, + "loss": 2.5711, + "theoretical_loss": 3.3168689384521586, + "tokens_seen": 3046178816 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004140290846877673, + "loss": 2.5709, + "theoretical_loss": 3.316863468925024, + "tokens_seen": 3046244352 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041392215568862275, + "loss": 2.5086, + "theoretical_loss": 3.316857999548504, + "tokens_seen": 3046309888 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041381522668947816, + "loss": 2.7012, + "theoretical_loss": 3.3168525303225924, + "tokens_seen": 3046375424 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041370829769033364, + "loss": 2.7311, + "theoretical_loss": 3.316847061247281, + "tokens_seen": 3046440960 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004136013686911891, + "loss": 2.4887, + "theoretical_loss": 3.316841592322563, + "tokens_seen": 3046506496 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004134944396920445, + "loss": 2.6398, + "theoretical_loss": 3.3168361235484305, + "tokens_seen": 3046572032 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041338751069289995, + "loss": 2.8039, + "theoretical_loss": 3.3168306549248765, + "tokens_seen": 3046637568 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041328058169375537, + "loss": 2.6058, + "theoretical_loss": 3.3168251864518936, + "tokens_seen": 3046703104 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004131736526946108, + "loss": 2.6846, + "theoretical_loss": 3.316819718129474, + "tokens_seen": 3046768640 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004130667236954662, + "loss": 2.6162, + "theoretical_loss": 3.31681424995761, + "tokens_seen": 3046834176 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004129597946963217, + "loss": 2.5892, + "theoretical_loss": 3.3168087819362957, + "tokens_seen": 3046899712 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041285286569717704, + "loss": 2.4469, + "theoretical_loss": 3.316803314065522, + "tokens_seen": 3046965248 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004127459366980325, + "loss": 2.6708, + "theoretical_loss": 3.316797846345283, + "tokens_seen": 3047030784 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041263900769888793, + "loss": 2.6839, + "theoretical_loss": 3.3167923787755704, + "tokens_seen": 3047096320 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041253207869974335, + "loss": 2.7111, + "theoretical_loss": 3.316786911356377, + "tokens_seen": 3047161856 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004124251497005988, + "loss": 2.5143, + "theoretical_loss": 3.3167814440876957, + "tokens_seen": 3047227392 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041231822070145424, + "loss": 2.4868, + "theoretical_loss": 3.316775976969519, + "tokens_seen": 3047292928 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004122112917023097, + "loss": 2.394, + "theoretical_loss": 3.316770510001839, + "tokens_seen": 3047358464 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004121043627031651, + "loss": 2.6288, + "theoretical_loss": 3.316765043184649, + "tokens_seen": 3047424000 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041199743370402055, + "loss": 2.6458, + "theoretical_loss": 3.316759576517941, + "tokens_seen": 3047489536 + }, + { + "epoch": 0.18, + "learning_rate": 0.00041189050470487597, + "loss": 2.4446, + "theoretical_loss": 3.316754110001708, + "tokens_seen": 3047555072 + }, + { + "epoch": 0.18, + "learning_rate": 0.0004117835757057314, + "loss": 2.6992, + "theoretical_loss": 3.3167486436359432, + "tokens_seen": 3047620608 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004116766467065868, + "loss": 2.7639, + "theoretical_loss": 3.3167431774206384, + "tokens_seen": 3047686144 + }, + { + "epoch": 0.19, + "objective/train/docs_used": 1670610, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5403025150299072, + "objective/train/theoretical_loss": 3.316737711355786, + "objective/train/tokens_used": 77802976, + "theoretical_loss": 3.316737711355786, + "tokens_seen": 3047751680 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004115697177074423, + "loss": 2.6853, + "theoretical_loss": 3.316737711355786, + "tokens_seen": 3047751680 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004114627887082977, + "loss": 2.7335, + "theoretical_loss": 3.3167322454413792, + "tokens_seen": 3047817216 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004113558597091531, + "loss": 2.6713, + "theoretical_loss": 3.3167267796774107, + "tokens_seen": 3047882752 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004112489307100086, + "loss": 2.8081, + "theoretical_loss": 3.316721314063873, + "tokens_seen": 3047948288 + }, + { + "epoch": 0.19, + "learning_rate": 0.000411142001710864, + "loss": 2.5437, + "theoretical_loss": 3.316715848600759, + "tokens_seen": 3048013824 + }, + { + "epoch": 0.19, + "learning_rate": 0.00041103507271171943, + "loss": 2.6631, + "theoretical_loss": 3.3167103832880604, + "tokens_seen": 3048079360 + }, + { + "epoch": 0.19, + "learning_rate": 0.00041092814371257485, + "loss": 2.4265, + "theoretical_loss": 3.3167049181257706, + "tokens_seen": 3048144896 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004108212147134303, + "loss": 2.4767, + "theoretical_loss": 3.316699453113882, + "tokens_seen": 3048210432 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004107142857142857, + "loss": 2.702, + "theoretical_loss": 3.316693988252387, + "tokens_seen": 3048275968 + }, + { + "epoch": 0.19, + "learning_rate": 0.00041060735671514116, + "loss": 2.5563, + "theoretical_loss": 3.3166885235412784, + "tokens_seen": 3048341504 + }, + { + "epoch": 0.19, + "learning_rate": 0.00041050042771599663, + "loss": 2.7919, + "theoretical_loss": 3.3166830589805496, + "tokens_seen": 3048407040 + }, + { + "epoch": 0.19, + "learning_rate": 0.000410393498716852, + "loss": 2.5336, + "theoretical_loss": 3.316677594570192, + "tokens_seen": 3048472576 + }, + { + "epoch": 0.19, + "learning_rate": 0.00041028656971770747, + "loss": 2.6564, + "theoretical_loss": 3.3166721303101987, + "tokens_seen": 3048538112 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004101796407185629, + "loss": 2.577, + "theoretical_loss": 3.316666666200563, + "tokens_seen": 3048603648 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004100727117194183, + "loss": 2.5261, + "theoretical_loss": 3.316661202241276, + "tokens_seen": 3048669184 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004099657827202737, + "loss": 2.6642, + "theoretical_loss": 3.316655738432332, + "tokens_seen": 3048734720 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004098588537211292, + "loss": 2.6357, + "theoretical_loss": 3.316650274773723, + "tokens_seen": 3048800256 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004097519247219846, + "loss": 2.7002, + "theoretical_loss": 3.3166448112654408, + "tokens_seen": 3048865792 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040964499572284003, + "loss": 2.5809, + "theoretical_loss": 3.316639347907479, + "tokens_seen": 3048931328 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004095380667236955, + "loss": 2.7383, + "theoretical_loss": 3.3166338846998302, + "tokens_seen": 3048996864 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004094311377245509, + "loss": 2.7608, + "theoretical_loss": 3.316628421642487, + "tokens_seen": 3049062400 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040932420872540634, + "loss": 2.4564, + "theoretical_loss": 3.316622958735442, + "tokens_seen": 3049127936 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040921727972626176, + "loss": 2.5476, + "theoretical_loss": 3.316617495978687, + "tokens_seen": 3049193472 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040911035072711723, + "loss": 2.5533, + "theoretical_loss": 3.3166120333722158, + "tokens_seen": 3049259008 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004090034217279726, + "loss": 2.75, + "theoretical_loss": 3.3166065709160204, + "tokens_seen": 3049324544 + }, + { + "epoch": 0.19, + "objective/train/docs_used": 1671708, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5090513229370117, + "objective/train/theoretical_loss": 3.3166011086100937, + "objective/train/tokens_used": 79441376, + "theoretical_loss": 3.3166011086100937, + "tokens_seen": 3049390080 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040889649272882807, + "loss": 2.6123, + "theoretical_loss": 3.3166011086100937, + "tokens_seen": 3049390080 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004087895637296835, + "loss": 2.4582, + "theoretical_loss": 3.316595646454428, + "tokens_seen": 3049455616 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004086826347305389, + "loss": 2.4455, + "theoretical_loss": 3.3165901844490167, + "tokens_seen": 3049521152 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040857570573139433, + "loss": 2.6832, + "theoretical_loss": 3.316584722593851, + "tokens_seen": 3049586688 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004084687767322498, + "loss": 2.8326, + "theoretical_loss": 3.3165792608889255, + "tokens_seen": 3049652224 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004083618477331053, + "loss": 2.6351, + "theoretical_loss": 3.3165737993342312, + "tokens_seen": 3049717760 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040825491873396064, + "loss": 2.5201, + "theoretical_loss": 3.3165683379297612, + "tokens_seen": 3049783296 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004081479897348161, + "loss": 2.6653, + "theoretical_loss": 3.3165628766755084, + "tokens_seen": 3049848832 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040804106073567153, + "loss": 2.715, + "theoretical_loss": 3.3165574155714657, + "tokens_seen": 3049914368 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040793413173652695, + "loss": 2.5546, + "theoretical_loss": 3.316551954617625, + "tokens_seen": 3049979904 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040782720273738237, + "loss": 2.7839, + "theoretical_loss": 3.3165464938139797, + "tokens_seen": 3050045440 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040772027373823784, + "loss": 2.4923, + "theoretical_loss": 3.3165410331605214, + "tokens_seen": 3050110976 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004076133447390932, + "loss": 2.6646, + "theoretical_loss": 3.3165355726572434, + "tokens_seen": 3050176512 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004075064157399487, + "loss": 2.5978, + "theoretical_loss": 3.316530112304139, + "tokens_seen": 3050242048 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040739948674080415, + "loss": 2.8643, + "theoretical_loss": 3.3165246521011995, + "tokens_seen": 3050307584 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004072925577416595, + "loss": 2.3827, + "theoretical_loss": 3.3165191920484185, + "tokens_seen": 3050373120 + }, + { + "epoch": 0.19, + "learning_rate": 0.000407185628742515, + "loss": 2.6466, + "theoretical_loss": 3.3165137321457885, + "tokens_seen": 3050438656 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004070786997433704, + "loss": 2.5199, + "theoretical_loss": 3.316508272393302, + "tokens_seen": 3050504192 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004069717707442259, + "loss": 2.5219, + "theoretical_loss": 3.3165028127909513, + "tokens_seen": 3050569728 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040686484174508124, + "loss": 2.6259, + "theoretical_loss": 3.3164973533387294, + "tokens_seen": 3050635264 + }, + { + "epoch": 0.19, + "learning_rate": 0.0004067579127459367, + "loss": 2.607, + "theoretical_loss": 3.3164918940366293, + "tokens_seen": 3050700800 + }, + { + "epoch": 0.19, + "learning_rate": 0.00040665098374679213, + "loss": 2.7539, + "theoretical_loss": 3.3164864348846432, + "tokens_seen": 3050766336 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040654405474764755, + "loss": 2.704, + "theoretical_loss": 3.3164809758827634, + "tokens_seen": 3050831872 + }, + { + "epoch": 0.2, + "learning_rate": 0.000406437125748503, + "loss": 2.7537, + "theoretical_loss": 3.3164755170309834, + "tokens_seen": 3050897408 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040633019674935844, + "loss": 2.7319, + "theoretical_loss": 3.3164700583292954, + "tokens_seen": 3050962944 + }, + { + "epoch": 0.2, + "objective/train/docs_used": 1672113, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8296430110931396, + "objective/train/theoretical_loss": 3.316464599777692, + "objective/train/tokens_used": 81079776, + "theoretical_loss": 3.316464599777692, + "tokens_seen": 3051028480 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040622326775021386, + "loss": 2.5403, + "theoretical_loss": 3.316464599777692, + "tokens_seen": 3051028480 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004061163387510693, + "loss": 2.7452, + "theoretical_loss": 3.316459141376166, + "tokens_seen": 3051094016 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040600940975192475, + "loss": 2.7405, + "theoretical_loss": 3.31645368312471, + "tokens_seen": 3051159552 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004059024807527802, + "loss": 2.7306, + "theoretical_loss": 3.3164482250233163, + "tokens_seen": 3051225088 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004057955517536356, + "loss": 2.5532, + "theoretical_loss": 3.3164427670719783, + "tokens_seen": 3051290624 + }, + { + "epoch": 0.2, + "learning_rate": 0.000405688622754491, + "loss": 2.5735, + "theoretical_loss": 3.316437309270688, + "tokens_seen": 3051356160 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004055816937553465, + "loss": 2.9272, + "theoretical_loss": 3.316431851619438, + "tokens_seen": 3051421696 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040547476475620185, + "loss": 2.5735, + "theoretical_loss": 3.316426394118222, + "tokens_seen": 3051487232 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004053678357570573, + "loss": 2.762, + "theoretical_loss": 3.316420936767031, + "tokens_seen": 3051552768 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004052609067579128, + "loss": 2.5567, + "theoretical_loss": 3.316415479565859, + "tokens_seen": 3051618304 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040515397775876816, + "loss": 2.7632, + "theoretical_loss": 3.3164100225146984, + "tokens_seen": 3051683840 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040504704875962363, + "loss": 2.6272, + "theoretical_loss": 3.3164045656135417, + "tokens_seen": 3051749376 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040494011976047905, + "loss": 2.6097, + "theoretical_loss": 3.316399108862381, + "tokens_seen": 3051814912 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040483319076133447, + "loss": 2.6183, + "theoretical_loss": 3.3163936522612096, + "tokens_seen": 3051880448 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004047262617621899, + "loss": 2.6047, + "theoretical_loss": 3.3163881958100205, + "tokens_seen": 3051945984 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040461933276304536, + "loss": 2.4767, + "theoretical_loss": 3.3163827395088052, + "tokens_seen": 3052011520 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004045124037639008, + "loss": 2.8153, + "theoretical_loss": 3.3163772833575575, + "tokens_seen": 3052077056 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004044054747647562, + "loss": 2.6842, + "theoretical_loss": 3.3163718273562695, + "tokens_seen": 3052142592 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040429854576561167, + "loss": 2.6261, + "theoretical_loss": 3.316366371504934, + "tokens_seen": 3052208128 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004041916167664671, + "loss": 2.4601, + "theoretical_loss": 3.3163609158035436, + "tokens_seen": 3052273664 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004040846877673225, + "loss": 2.4056, + "theoretical_loss": 3.316355460252091, + "tokens_seen": 3052339200 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004039777587681779, + "loss": 2.6035, + "theoretical_loss": 3.3163500048505687, + "tokens_seen": 3052404736 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004038708297690334, + "loss": 2.5094, + "theoretical_loss": 3.3163445495989694, + "tokens_seen": 3052470272 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040376390076988876, + "loss": 2.7304, + "theoretical_loss": 3.3163390944972857, + "tokens_seen": 3052535808 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040365697177074424, + "loss": 2.4984, + "theoretical_loss": 3.316333639545511, + "tokens_seen": 3052601344 + }, + { + "epoch": 0.2, + "objective/train/docs_used": 1673312, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1437244415283203, + "objective/train/theoretical_loss": 3.316328184743637, + "objective/train/tokens_used": 82718176, + "theoretical_loss": 3.316328184743637, + "tokens_seen": 3052666880 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040355004277159965, + "loss": 2.4723, + "theoretical_loss": 3.316328184743637, + "tokens_seen": 3052666880 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004034431137724551, + "loss": 2.5947, + "theoretical_loss": 3.316322730091657, + "tokens_seen": 3052732416 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040333618477331055, + "loss": 2.6962, + "theoretical_loss": 3.3163172755895634, + "tokens_seen": 3052797952 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040322925577416596, + "loss": 2.6721, + "theoretical_loss": 3.3163118212373486, + "tokens_seen": 3052863488 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040312232677502144, + "loss": 2.8194, + "theoretical_loss": 3.3163063670350055, + "tokens_seen": 3052929024 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004030153977758768, + "loss": 2.4612, + "theoretical_loss": 3.316300912982527, + "tokens_seen": 3052994560 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004029084687767323, + "loss": 2.7791, + "theoretical_loss": 3.3162954590799054, + "tokens_seen": 3053060096 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004028015397775877, + "loss": 2.5944, + "theoretical_loss": 3.316290005327134, + "tokens_seen": 3053125632 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004026946107784431, + "loss": 2.5546, + "theoretical_loss": 3.316284551724204, + "tokens_seen": 3053191168 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040258768177929853, + "loss": 2.6204, + "theoretical_loss": 3.31627909827111, + "tokens_seen": 3053256704 + }, + { + "epoch": 0.2, + "learning_rate": 0.000402480752780154, + "loss": 2.5182, + "theoretical_loss": 3.3162736449678434, + "tokens_seen": 3053322240 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004023738237810094, + "loss": 2.6924, + "theoretical_loss": 3.3162681918143972, + "tokens_seen": 3053387776 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040226689478186484, + "loss": 2.6499, + "theoretical_loss": 3.3162627388107637, + "tokens_seen": 3053453312 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004021599657827203, + "loss": 2.6051, + "theoretical_loss": 3.3162572859569366, + "tokens_seen": 3053518848 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004020530367835757, + "loss": 2.7286, + "theoretical_loss": 3.316251833252908, + "tokens_seen": 3053584384 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040194610778443115, + "loss": 2.5385, + "theoretical_loss": 3.3162463806986695, + "tokens_seen": 3053649920 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040183917878528657, + "loss": 2.5242, + "theoretical_loss": 3.3162409282942154, + "tokens_seen": 3053715456 + }, + { + "epoch": 0.2, + "learning_rate": 0.00040173224978614204, + "loss": 2.5566, + "theoretical_loss": 3.316235476039538, + "tokens_seen": 3053780992 + }, + { + "epoch": 0.2, + "learning_rate": 0.0004016253207869974, + "loss": 2.6922, + "theoretical_loss": 3.316230023934629, + "tokens_seen": 3053846528 + }, + { + "epoch": 0.21, + "learning_rate": 0.0004015183917878529, + "loss": 2.6956, + "theoretical_loss": 3.316224571979482, + "tokens_seen": 3053912064 + }, + { + "epoch": 0.21, + "learning_rate": 0.00040141146278870835, + "loss": 2.6225, + "theoretical_loss": 3.3162191201740896, + "tokens_seen": 3053977600 + }, + { + "epoch": 0.21, + "learning_rate": 0.0004013045337895637, + "loss": 2.5757, + "theoretical_loss": 3.3162136685184445, + "tokens_seen": 3054043136 + }, + { + "epoch": 0.21, + "learning_rate": 0.0004011976047904192, + "loss": 2.5213, + "theoretical_loss": 3.3162082170125387, + "tokens_seen": 3054108672 + }, + { + "epoch": 0.21, + "learning_rate": 0.0004010906757912746, + "loss": 2.6254, + "theoretical_loss": 3.3162027656563655, + "tokens_seen": 3054174208 + }, + { + "epoch": 0.21, + "learning_rate": 0.00040098374679213, + "loss": 2.4652, + "theoretical_loss": 3.3161973144499175, + "tokens_seen": 3054239744 + }, + { + "epoch": 0.21, + "objective/train/docs_used": 1674056, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5389561653137207, + "objective/train/theoretical_loss": 3.316191863393187, + "objective/train/tokens_used": 84356576, + "theoretical_loss": 3.316191863393187, + "tokens_seen": 3054305280 + }, + { + "epoch": 0.21, + "learning_rate": 0.00040087681779298545, + "loss": 2.6531, + "theoretical_loss": 3.316191863393187, + "tokens_seen": 3054305280 + }, + { + "epoch": 0.21, + "learning_rate": 0.0004007698887938409, + "loss": 2.8476, + "theoretical_loss": 3.3161864124861675, + "tokens_seen": 3054370816 + }, + { + "epoch": 0.21, + "learning_rate": 0.0004006629597946963, + "loss": 2.4375, + "theoretical_loss": 3.316180961728851, + "tokens_seen": 3054436352 + }, + { + "epoch": 0.21, + "learning_rate": 0.00040055603079555176, + "loss": 2.806, + "theoretical_loss": 3.31617551112123, + "tokens_seen": 3054501888 + }, + { + "epoch": 0.21, + "learning_rate": 0.0004004491017964072, + "loss": 2.4958, + "theoretical_loss": 3.316170060663298, + "tokens_seen": 3054567424 + }, + { + "epoch": 0.21, + "learning_rate": 0.00040034217279726265, + "loss": 2.741, + "theoretical_loss": 3.316164610355047, + "tokens_seen": 3054632960 + }, + { + "epoch": 0.21, + "learning_rate": 0.00040023524379811807, + "loss": 2.5739, + "theoretical_loss": 3.3161591601964697, + "tokens_seen": 3054698496 + }, + { + "epoch": 0.21, + "learning_rate": 0.0004001283147989735, + "loss": 2.5663, + "theoretical_loss": 3.316153710187559, + "tokens_seen": 3054764032 + }, + { + "epoch": 0.21, + "learning_rate": 0.00040002138579982896, + "loss": 2.8328, + "theoretical_loss": 3.3161482603283075, + "tokens_seen": 3054829568 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003999144568006843, + "loss": 2.5696, + "theoretical_loss": 3.316142810618708, + "tokens_seen": 3054895104 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003998075278015398, + "loss": 2.6351, + "theoretical_loss": 3.316137361058753, + "tokens_seen": 3054960640 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003997005988023952, + "loss": 2.6533, + "theoretical_loss": 3.3161319116484353, + "tokens_seen": 3055026176 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039959366980325063, + "loss": 2.9423, + "theoretical_loss": 3.3161264623877473, + "tokens_seen": 3055091712 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039948674080410605, + "loss": 2.6148, + "theoretical_loss": 3.3161210132766823, + "tokens_seen": 3055157248 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003993798118049615, + "loss": 2.6636, + "theoretical_loss": 3.3161155643152327, + "tokens_seen": 3055222784 + }, + { + "epoch": 0.21, + "learning_rate": 0.000399272882805817, + "loss": 2.8378, + "theoretical_loss": 3.316110115503391, + "tokens_seen": 3055288320 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039916595380667236, + "loss": 2.5724, + "theoretical_loss": 3.3161046668411496, + "tokens_seen": 3055353856 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039905902480752783, + "loss": 2.7957, + "theoretical_loss": 3.316099218328502, + "tokens_seen": 3055419392 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039895209580838325, + "loss": 2.6997, + "theoretical_loss": 3.3160937699654403, + "tokens_seen": 3055484928 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039884516680923867, + "loss": 2.6501, + "theoretical_loss": 3.3160883217519572, + "tokens_seen": 3055550464 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003987382378100941, + "loss": 2.7211, + "theoretical_loss": 3.316082873688045, + "tokens_seen": 3055616000 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039863130881094956, + "loss": 2.6697, + "theoretical_loss": 3.316077425773698, + "tokens_seen": 3055681536 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003985243798118049, + "loss": 2.5528, + "theoretical_loss": 3.316071978008907, + "tokens_seen": 3055747072 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003984174508126604, + "loss": 2.7356, + "theoretical_loss": 3.316066530393666, + "tokens_seen": 3055812608 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039831052181351587, + "loss": 2.5541, + "theoretical_loss": 3.316061082927967, + "tokens_seen": 3055878144 + }, + { + "epoch": 0.21, + "objective/train/docs_used": 1674689, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.368884563446045, + "objective/train/theoretical_loss": 3.3160556356118027, + "objective/train/tokens_used": 85994976, + "theoretical_loss": 3.3160556356118027, + "tokens_seen": 3055943680 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039820359281437124, + "loss": 2.7249, + "theoretical_loss": 3.3160556356118027, + "tokens_seen": 3055943680 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003980966638152267, + "loss": 2.8312, + "theoretical_loss": 3.316050188445166, + "tokens_seen": 3056009216 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039798973481608213, + "loss": 2.7138, + "theoretical_loss": 3.3160447414280494, + "tokens_seen": 3056074752 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003978828058169376, + "loss": 2.6114, + "theoretical_loss": 3.3160392945604458, + "tokens_seen": 3056140288 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039777587681779297, + "loss": 2.7356, + "theoretical_loss": 3.316033847842348, + "tokens_seen": 3056205824 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039766894781864844, + "loss": 2.6253, + "theoretical_loss": 3.316028401273748, + "tokens_seen": 3056271360 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039756201881950386, + "loss": 2.4828, + "theoretical_loss": 3.3160229548546396, + "tokens_seen": 3056336896 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003974550898203593, + "loss": 2.7273, + "theoretical_loss": 3.3160175085850145, + "tokens_seen": 3056402432 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039734816082121475, + "loss": 2.7228, + "theoretical_loss": 3.316012062464866, + "tokens_seen": 3056467968 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039724123182207017, + "loss": 2.528, + "theoretical_loss": 3.316006616494186, + "tokens_seen": 3056533504 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003971343028229256, + "loss": 2.7335, + "theoretical_loss": 3.316001170672968, + "tokens_seen": 3056599040 + }, + { + "epoch": 0.21, + "learning_rate": 0.000397027373823781, + "loss": 2.6632, + "theoretical_loss": 3.315995725001205, + "tokens_seen": 3056664576 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003969204448246365, + "loss": 2.4579, + "theoretical_loss": 3.3159902794788887, + "tokens_seen": 3056730112 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039681351582549184, + "loss": 2.6843, + "theoretical_loss": 3.3159848341060125, + "tokens_seen": 3056795648 + }, + { + "epoch": 0.21, + "learning_rate": 0.0003967065868263473, + "loss": 2.6835, + "theoretical_loss": 3.3159793888825684, + "tokens_seen": 3056861184 + }, + { + "epoch": 0.21, + "learning_rate": 0.00039659965782720273, + "loss": 2.5826, + "theoretical_loss": 3.31597394380855, + "tokens_seen": 3056926720 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003964927288280582, + "loss": 2.6006, + "theoretical_loss": 3.315968498883949, + "tokens_seen": 3056992256 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039638579982891357, + "loss": 2.5882, + "theoretical_loss": 3.3159630541087592, + "tokens_seen": 3057057792 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039627887082976904, + "loss": 2.6443, + "theoretical_loss": 3.3159576094829726, + "tokens_seen": 3057123328 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003961719418306245, + "loss": 2.7833, + "theoretical_loss": 3.3159521650065815, + "tokens_seen": 3057188864 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003960650128314799, + "loss": 2.7132, + "theoretical_loss": 3.3159467206795794, + "tokens_seen": 3057254400 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039595808383233535, + "loss": 2.6205, + "theoretical_loss": 3.3159412765019587, + "tokens_seen": 3057319936 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039585115483319077, + "loss": 2.585, + "theoretical_loss": 3.3159358324737123, + "tokens_seen": 3057385472 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003957442258340462, + "loss": 2.749, + "theoretical_loss": 3.3159303885948326, + "tokens_seen": 3057451008 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003956372968349016, + "loss": 2.3975, + "theoretical_loss": 3.3159249448653125, + "tokens_seen": 3057516544 + }, + { + "epoch": 0.22, + "objective/train/docs_used": 1675857, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.825104236602783, + "objective/train/theoretical_loss": 3.3159195012851446, + "objective/train/tokens_used": 87633376, + "theoretical_loss": 3.3159195012851446, + "tokens_seen": 3057582080 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003955303678357571, + "loss": 2.6357, + "theoretical_loss": 3.3159195012851446, + "tokens_seen": 3057582080 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039542343883661245, + "loss": 2.6898, + "theoretical_loss": 3.3159140578543216, + "tokens_seen": 3057647616 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003953165098374679, + "loss": 2.7107, + "theoretical_loss": 3.315908614572836, + "tokens_seen": 3057713152 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003952095808383234, + "loss": 2.8232, + "theoretical_loss": 3.315903171440681, + "tokens_seen": 3057778688 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003951026518391788, + "loss": 2.6316, + "theoretical_loss": 3.315897728457849, + "tokens_seen": 3057844224 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039499572284003423, + "loss": 2.5617, + "theoretical_loss": 3.3158922856243325, + "tokens_seen": 3057909760 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039488879384088965, + "loss": 2.5367, + "theoretical_loss": 3.3158868429401247, + "tokens_seen": 3057975296 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003947818648417451, + "loss": 2.498, + "theoretical_loss": 3.3158814004052175, + "tokens_seen": 3058040832 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003946749358426005, + "loss": 2.7066, + "theoretical_loss": 3.3158759580196047, + "tokens_seen": 3058106368 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039456800684345596, + "loss": 2.3936, + "theoretical_loss": 3.3158705157832786, + "tokens_seen": 3058171904 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003944610778443114, + "loss": 2.7104, + "theoretical_loss": 3.315865073696231, + "tokens_seen": 3058237440 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003943541488451668, + "loss": 2.6365, + "theoretical_loss": 3.315859631758456, + "tokens_seen": 3058302976 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039424721984602227, + "loss": 2.5736, + "theoretical_loss": 3.3158541899699454, + "tokens_seen": 3058368512 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003941402908468777, + "loss": 2.6477, + "theoretical_loss": 3.3158487483306924, + "tokens_seen": 3058434048 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039403336184773316, + "loss": 2.6896, + "theoretical_loss": 3.315843306840689, + "tokens_seen": 3058499584 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003939264328485885, + "loss": 2.702, + "theoretical_loss": 3.3158378654999288, + "tokens_seen": 3058565120 + }, + { + "epoch": 0.22, + "learning_rate": 0.000393819503849444, + "loss": 2.4579, + "theoretical_loss": 3.315832424308404, + "tokens_seen": 3058630656 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003937125748502994, + "loss": 2.7327, + "theoretical_loss": 3.315826983266107, + "tokens_seen": 3058696192 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039360564585115483, + "loss": 2.5108, + "theoretical_loss": 3.3158215423730315, + "tokens_seen": 3058761728 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039349871685201025, + "loss": 2.3936, + "theoretical_loss": 3.31581610162917, + "tokens_seen": 3058827264 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003933917878528657, + "loss": 2.8121, + "theoretical_loss": 3.315810661034514, + "tokens_seen": 3058892800 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003932848588537211, + "loss": 2.3925, + "theoretical_loss": 3.3158052205890574, + "tokens_seen": 3058958336 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039317792985457656, + "loss": 2.5505, + "theoretical_loss": 3.3157997802927923, + "tokens_seen": 3059023872 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039307100085543204, + "loss": 2.6794, + "theoretical_loss": 3.315794340145712, + "tokens_seen": 3059089408 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003929640718562874, + "loss": 2.5868, + "theoretical_loss": 3.3157889001478087, + "tokens_seen": 3059154944 + }, + { + "epoch": 0.22, + "objective/train/docs_used": 1676525, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.8923184871673584, + "objective/train/theoretical_loss": 3.3157834602990754, + "objective/train/tokens_used": 89271776, + "theoretical_loss": 3.3157834602990754, + "tokens_seen": 3059220480 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003928571428571429, + "loss": 2.7254, + "theoretical_loss": 3.3157834602990754, + "tokens_seen": 3059220480 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003927502138579983, + "loss": 2.4594, + "theoretical_loss": 3.3157780205995047, + "tokens_seen": 3059286016 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039264328485885376, + "loss": 2.573, + "theoretical_loss": 3.3157725810490892, + "tokens_seen": 3059351552 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039253635585970913, + "loss": 2.729, + "theoretical_loss": 3.3157671416478216, + "tokens_seen": 3059417088 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003924294268605646, + "loss": 2.6558, + "theoretical_loss": 3.3157617023956956, + "tokens_seen": 3059482624 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039232249786142, + "loss": 2.6111, + "theoretical_loss": 3.3157562632927022, + "tokens_seen": 3059548160 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039221556886227544, + "loss": 2.6031, + "theoretical_loss": 3.3157508243388354, + "tokens_seen": 3059613696 + }, + { + "epoch": 0.22, + "learning_rate": 0.0003921086398631309, + "loss": 2.513, + "theoretical_loss": 3.315745385534087, + "tokens_seen": 3059679232 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039200171086398633, + "loss": 2.5943, + "theoretical_loss": 3.315739946878451, + "tokens_seen": 3059744768 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039189478186484175, + "loss": 2.6268, + "theoretical_loss": 3.3157345083719187, + "tokens_seen": 3059810304 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039178785286569717, + "loss": 2.5546, + "theoretical_loss": 3.3157290700144837, + "tokens_seen": 3059875840 + }, + { + "epoch": 0.22, + "learning_rate": 0.00039168092386655264, + "loss": 2.77, + "theoretical_loss": 3.3157236318061387, + "tokens_seen": 3059941376 + }, + { + "epoch": 0.22, + "learning_rate": 0.000391573994867408, + "loss": 2.6186, + "theoretical_loss": 3.315718193746876, + "tokens_seen": 3060006912 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003914670658682635, + "loss": 2.5831, + "theoretical_loss": 3.315712755836689, + "tokens_seen": 3060072448 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003913601368691189, + "loss": 2.7755, + "theoretical_loss": 3.315707318075569, + "tokens_seen": 3060137984 + }, + { + "epoch": 0.23, + "learning_rate": 0.00039125320786997437, + "loss": 2.5705, + "theoretical_loss": 3.3157018804635103, + "tokens_seen": 3060203520 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003911462788708298, + "loss": 2.5262, + "theoretical_loss": 3.3156964430005047, + "tokens_seen": 3060269056 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003910393498716852, + "loss": 2.6334, + "theoretical_loss": 3.3156910056865456, + "tokens_seen": 3060334592 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003909324208725407, + "loss": 2.5361, + "theoretical_loss": 3.315685568521625, + "tokens_seen": 3060400128 + }, + { + "epoch": 0.23, + "learning_rate": 0.00039082549187339604, + "loss": 2.7134, + "theoretical_loss": 3.315680131505736, + "tokens_seen": 3060465664 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003907185628742515, + "loss": 2.4863, + "theoretical_loss": 3.3156746946388713, + "tokens_seen": 3060531200 + }, + { + "epoch": 0.23, + "learning_rate": 0.00039061163387510694, + "loss": 2.2524, + "theoretical_loss": 3.315669257921024, + "tokens_seen": 3060596736 + }, + { + "epoch": 0.23, + "learning_rate": 0.00039050470487596235, + "loss": 2.677, + "theoretical_loss": 3.315663821352186, + "tokens_seen": 3060662272 + }, + { + "epoch": 0.23, + "learning_rate": 0.00039039777587681777, + "loss": 2.7872, + "theoretical_loss": 3.3156583849323504, + "tokens_seen": 3060727808 + }, + { + "epoch": 0.23, + "learning_rate": 0.00039029084687767325, + "loss": 2.5604, + "theoretical_loss": 3.3156529486615103, + "tokens_seen": 3060793344 + }, + { + "epoch": 0.23, + "objective/train/docs_used": 1677413, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.808596134185791, + "objective/train/theoretical_loss": 3.315647512539658, + "objective/train/tokens_used": 90910176, + "theoretical_loss": 3.315647512539658, + "tokens_seen": 3060858880 + }, + { + "epoch": 0.23, + "learning_rate": 0.00039018391787852866, + "loss": 2.6754, + "theoretical_loss": 3.315647512539658, + "tokens_seen": 3060858880 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003900769888793841, + "loss": 2.8492, + "theoretical_loss": 3.3156420765667862, + "tokens_seen": 3060924416 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038997005988023956, + "loss": 2.6422, + "theoretical_loss": 3.3156366407428877, + "tokens_seen": 3060989952 + }, + { + "epoch": 0.23, + "learning_rate": 0.000389863130881095, + "loss": 2.5492, + "theoretical_loss": 3.3156312050679553, + "tokens_seen": 3061055488 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003897562018819504, + "loss": 2.6541, + "theoretical_loss": 3.315625769541982, + "tokens_seen": 3061121024 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003896492728828058, + "loss": 2.4985, + "theoretical_loss": 3.31562033416496, + "tokens_seen": 3061186560 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003895423438836613, + "loss": 2.6753, + "theoretical_loss": 3.3156148989368823, + "tokens_seen": 3061252096 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038943541488451665, + "loss": 2.5798, + "theoretical_loss": 3.315609463857742, + "tokens_seen": 3061317632 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003893284858853721, + "loss": 2.8191, + "theoretical_loss": 3.315604028927531, + "tokens_seen": 3061383168 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003892215568862276, + "loss": 2.5816, + "theoretical_loss": 3.3155985941462425, + "tokens_seen": 3061448704 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038911462788708296, + "loss": 2.3644, + "theoretical_loss": 3.3155931595138695, + "tokens_seen": 3061514240 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038900769888793843, + "loss": 2.4628, + "theoretical_loss": 3.315587725030404, + "tokens_seen": 3061579776 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038890076988879385, + "loss": 2.6182, + "theoretical_loss": 3.3155822906958394, + "tokens_seen": 3061645312 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003887938408896493, + "loss": 2.7337, + "theoretical_loss": 3.315576856510168, + "tokens_seen": 3061710848 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003886869118905047, + "loss": 2.4686, + "theoretical_loss": 3.315571422473383, + "tokens_seen": 3061776384 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038857998289136016, + "loss": 2.6188, + "theoretical_loss": 3.315565988585477, + "tokens_seen": 3061841920 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003884730538922156, + "loss": 2.7897, + "theoretical_loss": 3.315560554846442, + "tokens_seen": 3061907456 + }, + { + "epoch": 0.23, + "learning_rate": 0.000388366124893071, + "loss": 2.5684, + "theoretical_loss": 3.3155551212562724, + "tokens_seen": 3061972992 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003882591958939264, + "loss": 2.8051, + "theoretical_loss": 3.315549687814959, + "tokens_seen": 3062038528 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003881522668947819, + "loss": 2.8572, + "theoretical_loss": 3.3155442545224956, + "tokens_seen": 3062104064 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003880453378956373, + "loss": 2.6365, + "theoretical_loss": 3.315538821378875, + "tokens_seen": 3062169600 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003879384088964927, + "loss": 2.6204, + "theoretical_loss": 3.315533388384089, + "tokens_seen": 3062235136 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003878314798973482, + "loss": 2.6727, + "theoretical_loss": 3.3155279555381316, + "tokens_seen": 3062300672 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038772455089820356, + "loss": 2.6589, + "theoretical_loss": 3.315522522840995, + "tokens_seen": 3062366208 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038761762189905904, + "loss": 2.3496, + "theoretical_loss": 3.315517090292672, + "tokens_seen": 3062431744 + }, + { + "epoch": 0.23, + "objective/train/docs_used": 1678257, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5287790298461914, + "objective/train/theoretical_loss": 3.3155116578931545, + "objective/train/tokens_used": 92548576, + "theoretical_loss": 3.3155116578931545, + "tokens_seen": 3062497280 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038751069289991446, + "loss": 2.6185, + "theoretical_loss": 3.3155116578931545, + "tokens_seen": 3062497280 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038740376390076993, + "loss": 2.6733, + "theoretical_loss": 3.3155062256424364, + "tokens_seen": 3062562816 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003872968349016253, + "loss": 2.5063, + "theoretical_loss": 3.31550079354051, + "tokens_seen": 3062628352 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038718990590248077, + "loss": 2.487, + "theoretical_loss": 3.3154953615873684, + "tokens_seen": 3062693888 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038708297690333624, + "loss": 2.5369, + "theoretical_loss": 3.315489929783004, + "tokens_seen": 3062759424 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003869760479041916, + "loss": 2.6101, + "theoretical_loss": 3.3154844981274088, + "tokens_seen": 3062824960 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003868691189050471, + "loss": 2.7359, + "theoretical_loss": 3.315479066620577, + "tokens_seen": 3062890496 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003867621899059025, + "loss": 2.6555, + "theoretical_loss": 3.3154736352625003, + "tokens_seen": 3062956032 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003866552609067579, + "loss": 2.7075, + "theoretical_loss": 3.3154682040531718, + "tokens_seen": 3063021568 + }, + { + "epoch": 0.23, + "learning_rate": 0.00038654833190761333, + "loss": 2.5476, + "theoretical_loss": 3.315462772992584, + "tokens_seen": 3063087104 + }, + { + "epoch": 0.23, + "learning_rate": 0.0003864414029084688, + "loss": 2.6774, + "theoretical_loss": 3.3154573420807303, + "tokens_seen": 3063152640 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038633447390932417, + "loss": 2.6647, + "theoretical_loss": 3.3154519113176026, + "tokens_seen": 3063218176 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038622754491017964, + "loss": 2.5317, + "theoretical_loss": 3.3154464807031943, + "tokens_seen": 3063283712 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003861206159110351, + "loss": 2.4487, + "theoretical_loss": 3.3154410502374976, + "tokens_seen": 3063349248 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038601368691189053, + "loss": 2.7459, + "theoretical_loss": 3.315435619920506, + "tokens_seen": 3063414784 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038590675791274595, + "loss": 2.4292, + "theoretical_loss": 3.3154301897522114, + "tokens_seen": 3063480320 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038579982891360137, + "loss": 2.6626, + "theoretical_loss": 3.315424759732607, + "tokens_seen": 3063545856 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038569289991445684, + "loss": 2.4853, + "theoretical_loss": 3.315419329861686, + "tokens_seen": 3063611392 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003855859709153122, + "loss": 2.7056, + "theoretical_loss": 3.31541390013944, + "tokens_seen": 3063676928 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003854790419161677, + "loss": 2.7404, + "theoretical_loss": 3.3154084705658624, + "tokens_seen": 3063742464 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003853721129170231, + "loss": 2.6673, + "theoretical_loss": 3.3154030411409465, + "tokens_seen": 3063808000 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003852651839178785, + "loss": 2.6915, + "theoretical_loss": 3.3153976118646837, + "tokens_seen": 3063873536 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038515825491873394, + "loss": 2.7155, + "theoretical_loss": 3.3153921827370683, + "tokens_seen": 3063939072 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003850513259195894, + "loss": 2.6748, + "theoretical_loss": 3.3153867537580917, + "tokens_seen": 3064004608 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038494439692044483, + "loss": 2.6202, + "theoretical_loss": 3.3153813249277473, + "tokens_seen": 3064070144 + }, + { + "epoch": 0.24, + "objective/train/docs_used": 1679004, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.687706708908081, + "objective/train/theoretical_loss": 3.315375896246028, + "objective/train/tokens_used": 94186976, + "theoretical_loss": 3.315375896246028, + "tokens_seen": 3064135680 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038483746792130025, + "loss": 2.4507, + "theoretical_loss": 3.315375896246028, + "tokens_seen": 3064135680 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003847305389221557, + "loss": 2.452, + "theoretical_loss": 3.3153704677129263, + "tokens_seen": 3064201216 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038462360992301114, + "loss": 2.7768, + "theoretical_loss": 3.3153650393284346, + "tokens_seen": 3064266752 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038451668092386656, + "loss": 2.806, + "theoretical_loss": 3.3153596110925467, + "tokens_seen": 3064332288 + }, + { + "epoch": 0.24, + "learning_rate": 0.000384409751924722, + "loss": 2.5273, + "theoretical_loss": 3.315354183005254, + "tokens_seen": 3064397824 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038430282292557745, + "loss": 2.5489, + "theoretical_loss": 3.315348755066551, + "tokens_seen": 3064463360 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003841958939264328, + "loss": 2.5144, + "theoretical_loss": 3.315343327276428, + "tokens_seen": 3064528896 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003840889649272883, + "loss": 2.6007, + "theoretical_loss": 3.31533789963488, + "tokens_seen": 3064594432 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038398203592814376, + "loss": 2.5804, + "theoretical_loss": 3.315332472141899, + "tokens_seen": 3064659968 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003838751069289991, + "loss": 2.5643, + "theoretical_loss": 3.3153270447974776, + "tokens_seen": 3064725504 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003837681779298546, + "loss": 2.4473, + "theoretical_loss": 3.315321617601608, + "tokens_seen": 3064791040 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038366124893071, + "loss": 2.6153, + "theoretical_loss": 3.3153161905542845, + "tokens_seen": 3064856576 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038355431993156543, + "loss": 2.5416, + "theoretical_loss": 3.315310763655498, + "tokens_seen": 3064922112 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038344739093242085, + "loss": 2.6112, + "theoretical_loss": 3.315305336905243, + "tokens_seen": 3064987648 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003833404619332763, + "loss": 2.6796, + "theoretical_loss": 3.3152999103035112, + "tokens_seen": 3065053184 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038323353293413174, + "loss": 2.6893, + "theoretical_loss": 3.3152944838502956, + "tokens_seen": 3065118720 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038312660393498716, + "loss": 2.6505, + "theoretical_loss": 3.315289057545589, + "tokens_seen": 3065184256 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038301967493584263, + "loss": 2.8139, + "theoretical_loss": 3.3152836313893843, + "tokens_seen": 3065249792 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038291274593669805, + "loss": 2.3723, + "theoretical_loss": 3.3152782053816736, + "tokens_seen": 3065315328 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038280581693755347, + "loss": 2.4177, + "theoretical_loss": 3.3152727795224504, + "tokens_seen": 3065380864 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003826988879384089, + "loss": 2.6492, + "theoretical_loss": 3.3152673538117075, + "tokens_seen": 3065446400 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038259195893926436, + "loss": 2.6787, + "theoretical_loss": 3.3152619282494373, + "tokens_seen": 3065511936 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038248502994011973, + "loss": 2.7785, + "theoretical_loss": 3.3152565028356324, + "tokens_seen": 3065577472 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003823781009409752, + "loss": 2.5369, + "theoretical_loss": 3.315251077570286, + "tokens_seen": 3065643008 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003822711719418306, + "loss": 2.764, + "theoretical_loss": 3.315245652453391, + "tokens_seen": 3065708544 + }, + { + "epoch": 0.24, + "objective/train/docs_used": 1680022, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6200811862945557, + "objective/train/theoretical_loss": 3.3152402274849395, + "objective/train/tokens_used": 95825376, + "theoretical_loss": 3.3152402274849395, + "tokens_seen": 3065774080 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003821642429426861, + "loss": 2.7352, + "theoretical_loss": 3.3152402274849395, + "tokens_seen": 3065774080 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003820573139435415, + "loss": 2.552, + "theoretical_loss": 3.3152348026649245, + "tokens_seen": 3065839616 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038195038494439693, + "loss": 2.586, + "theoretical_loss": 3.315229377993339, + "tokens_seen": 3065905152 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003818434559452524, + "loss": 2.666, + "theoretical_loss": 3.3152239534701753, + "tokens_seen": 3065970688 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038173652694610777, + "loss": 2.643, + "theoretical_loss": 3.315218529095427, + "tokens_seen": 3066036224 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038162959794696324, + "loss": 2.6234, + "theoretical_loss": 3.3152131048690863, + "tokens_seen": 3066101760 + }, + { + "epoch": 0.24, + "learning_rate": 0.00038152266894781866, + "loss": 2.6121, + "theoretical_loss": 3.315207680791146, + "tokens_seen": 3066167296 + }, + { + "epoch": 0.24, + "learning_rate": 0.0003814157399486741, + "loss": 2.6519, + "theoretical_loss": 3.315202256861599, + "tokens_seen": 3066232832 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003813088109495295, + "loss": 2.6287, + "theoretical_loss": 3.3151968330804378, + "tokens_seen": 3066298368 + }, + { + "epoch": 0.25, + "learning_rate": 0.00038120188195038497, + "loss": 2.4321, + "theoretical_loss": 3.3151914094476553, + "tokens_seen": 3066363904 + }, + { + "epoch": 0.25, + "learning_rate": 0.00038109495295124033, + "loss": 2.7344, + "theoretical_loss": 3.3151859859632444, + "tokens_seen": 3066429440 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003809880239520958, + "loss": 2.4239, + "theoretical_loss": 3.315180562627198, + "tokens_seen": 3066494976 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003808810949529513, + "loss": 2.6798, + "theoretical_loss": 3.3151751394395084, + "tokens_seen": 3066560512 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003807741659538067, + "loss": 2.6429, + "theoretical_loss": 3.3151697164001686, + "tokens_seen": 3066626048 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003806672369546621, + "loss": 2.6308, + "theoretical_loss": 3.3151642935091714, + "tokens_seen": 3066691584 + }, + { + "epoch": 0.25, + "learning_rate": 0.00038056030795551753, + "loss": 2.4593, + "theoretical_loss": 3.31515887076651, + "tokens_seen": 3066757120 + }, + { + "epoch": 0.25, + "learning_rate": 0.000380453378956373, + "loss": 2.6, + "theoretical_loss": 3.3151534481721763, + "tokens_seen": 3066822656 + }, + { + "epoch": 0.25, + "learning_rate": 0.00038034644995722837, + "loss": 2.689, + "theoretical_loss": 3.3151480257261636, + "tokens_seen": 3066888192 + }, + { + "epoch": 0.25, + "learning_rate": 0.00038023952095808384, + "loss": 2.6198, + "theoretical_loss": 3.3151426034284643, + "tokens_seen": 3066953728 + }, + { + "epoch": 0.25, + "learning_rate": 0.00038013259195893926, + "loss": 2.774, + "theoretical_loss": 3.315137181279072, + "tokens_seen": 3067019264 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003800256629597947, + "loss": 2.4105, + "theoretical_loss": 3.3151317592779788, + "tokens_seen": 3067084800 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037991873396065015, + "loss": 2.5212, + "theoretical_loss": 3.3151263374251774, + "tokens_seen": 3067150336 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003798118049615056, + "loss": 2.5729, + "theoretical_loss": 3.315120915720661, + "tokens_seen": 3067215872 + }, + { + "epoch": 0.25, + "learning_rate": 0.000379704875962361, + "loss": 2.5886, + "theoretical_loss": 3.3151154941644223, + "tokens_seen": 3067281408 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003795979469632164, + "loss": 2.345, + "theoretical_loss": 3.315110072756454, + "tokens_seen": 3067346944 + }, + { + "epoch": 0.25, + "objective/train/docs_used": 1680696, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1590898036956787, + "objective/train/theoretical_loss": 3.3151046514967484, + "objective/train/tokens_used": 97463776, + "theoretical_loss": 3.3151046514967484, + "tokens_seen": 3067412480 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003794910179640719, + "loss": 2.4677, + "theoretical_loss": 3.3151046514967484, + "tokens_seen": 3067412480 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003793840889649273, + "loss": 2.6849, + "theoretical_loss": 3.315099230385299, + "tokens_seen": 3067478016 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003792771599657827, + "loss": 2.6453, + "theoretical_loss": 3.3150938094220983, + "tokens_seen": 3067543552 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037917023096663814, + "loss": 2.5762, + "theoretical_loss": 3.3150883886071387, + "tokens_seen": 3067609088 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003790633019674936, + "loss": 2.3638, + "theoretical_loss": 3.3150829679404135, + "tokens_seen": 3067674624 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037895637296834903, + "loss": 2.5771, + "theoretical_loss": 3.3150775474219154, + "tokens_seen": 3067740160 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037884944396920445, + "loss": 2.578, + "theoretical_loss": 3.3150721270516375, + "tokens_seen": 3067805696 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003787425149700599, + "loss": 2.7761, + "theoretical_loss": 3.3150667068295716, + "tokens_seen": 3067871232 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003786355859709153, + "loss": 2.6044, + "theoretical_loss": 3.315061286755711, + "tokens_seen": 3067936768 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037852865697177076, + "loss": 2.7257, + "theoretical_loss": 3.3150558668300487, + "tokens_seen": 3068002304 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003784217279726262, + "loss": 2.5639, + "theoretical_loss": 3.3150504470525775, + "tokens_seen": 3068067840 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003783147989734816, + "loss": 2.9122, + "theoretical_loss": 3.31504502742329, + "tokens_seen": 3068133376 + }, + { + "epoch": 0.25, + "learning_rate": 0.000378207869974337, + "loss": 2.6439, + "theoretical_loss": 3.315039607942179, + "tokens_seen": 3068198912 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003781009409751925, + "loss": 2.6043, + "theoretical_loss": 3.3150341886092374, + "tokens_seen": 3068264448 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037799401197604796, + "loss": 2.7199, + "theoretical_loss": 3.3150287694244573, + "tokens_seen": 3068329984 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003778870829769033, + "loss": 2.4025, + "theoretical_loss": 3.3150233503878326, + "tokens_seen": 3068395520 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003777801539777588, + "loss": 2.7244, + "theoretical_loss": 3.3150179314993555, + "tokens_seen": 3068461056 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003776732249786142, + "loss": 2.7027, + "theoretical_loss": 3.3150125127590186, + "tokens_seen": 3068526592 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037756629597946964, + "loss": 2.4484, + "theoretical_loss": 3.315007094166815, + "tokens_seen": 3068592128 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037745936698032505, + "loss": 2.7013, + "theoretical_loss": 3.3150016757227374, + "tokens_seen": 3068657664 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003773524379811805, + "loss": 2.7631, + "theoretical_loss": 3.3149962574267784, + "tokens_seen": 3068723200 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003772455089820359, + "loss": 2.665, + "theoretical_loss": 3.314990839278931, + "tokens_seen": 3068788736 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037713857998289136, + "loss": 2.7216, + "theoretical_loss": 3.314985421279188, + "tokens_seen": 3068854272 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037703165098374684, + "loss": 2.6826, + "theoretical_loss": 3.314980003427542, + "tokens_seen": 3068919808 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037692472198460226, + "loss": 2.3532, + "theoretical_loss": 3.3149745857239865, + "tokens_seen": 3068985344 + }, + { + "epoch": 0.25, + "objective/train/docs_used": 1681971, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7027196884155273, + "objective/train/theoretical_loss": 3.3149691681685134, + "objective/train/tokens_used": 99102176, + "theoretical_loss": 3.3149691681685134, + "tokens_seen": 3069050880 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003768177929854577, + "loss": 2.6228, + "theoretical_loss": 3.3149691681685134, + "tokens_seen": 3069050880 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003767108639863131, + "loss": 2.5265, + "theoretical_loss": 3.3149637507611156, + "tokens_seen": 3069116416 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037660393498716857, + "loss": 2.7061, + "theoretical_loss": 3.314958333501786, + "tokens_seen": 3069181952 + }, + { + "epoch": 0.25, + "learning_rate": 0.00037649700598802393, + "loss": 2.6572, + "theoretical_loss": 3.314952916390518, + "tokens_seen": 3069247488 + }, + { + "epoch": 0.25, + "learning_rate": 0.0003763900769888794, + "loss": 2.5888, + "theoretical_loss": 3.3149474994273036, + "tokens_seen": 3069313024 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003762831479897348, + "loss": 2.5132, + "theoretical_loss": 3.314942082612136, + "tokens_seen": 3069378560 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037617621899059024, + "loss": 2.367, + "theoretical_loss": 3.314936665945008, + "tokens_seen": 3069444096 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037606928999144566, + "loss": 2.4176, + "theoretical_loss": 3.3149312494259116, + "tokens_seen": 3069509632 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037596236099230113, + "loss": 2.5258, + "theoretical_loss": 3.314925833054841, + "tokens_seen": 3069575168 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037585543199315655, + "loss": 2.635, + "theoretical_loss": 3.3149204168317876, + "tokens_seen": 3069640704 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037574850299401197, + "loss": 2.6698, + "theoretical_loss": 3.3149150007567454, + "tokens_seen": 3069706240 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037564157399486744, + "loss": 2.3874, + "theoretical_loss": 3.3149095848297065, + "tokens_seen": 3069771776 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037553464499572286, + "loss": 2.5278, + "theoretical_loss": 3.314904169050664, + "tokens_seen": 3069837312 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003754277159965783, + "loss": 2.6473, + "theoretical_loss": 3.31489875341961, + "tokens_seen": 3069902848 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003753207869974337, + "loss": 2.5471, + "theoretical_loss": 3.3148933379365384, + "tokens_seen": 3069968384 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037521385799828917, + "loss": 2.5681, + "theoretical_loss": 3.314887922601441, + "tokens_seen": 3070033920 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037510692899914453, + "loss": 2.7144, + "theoretical_loss": 3.3148825074143113, + "tokens_seen": 3070099456 + }, + { + "epoch": 0.26, + "learning_rate": 0.000375, + "loss": 2.8148, + "theoretical_loss": 3.314877092375142, + "tokens_seen": 3070164992 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003748930710008555, + "loss": 2.5621, + "theoretical_loss": 3.314871677483925, + "tokens_seen": 3070230528 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037478614200171085, + "loss": 2.6176, + "theoretical_loss": 3.3148662627406544, + "tokens_seen": 3070296064 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003746792130025663, + "loss": 2.4838, + "theoretical_loss": 3.3148608481453223, + "tokens_seen": 3070361600 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037457228400342174, + "loss": 2.5001, + "theoretical_loss": 3.3148554336979217, + "tokens_seen": 3070427136 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037446535500427716, + "loss": 2.5305, + "theoretical_loss": 3.3148500193984454, + "tokens_seen": 3070492672 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003743584260051326, + "loss": 2.7008, + "theoretical_loss": 3.3148446052468863, + "tokens_seen": 3070558208 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037425149700598805, + "loss": 2.7314, + "theoretical_loss": 3.3148391912432364, + "tokens_seen": 3070623744 + }, + { + "epoch": 0.26, + "objective/train/docs_used": 1682525, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.117476463317871, + "objective/train/theoretical_loss": 3.3148337773874896, + "objective/train/tokens_used": 100740576, + "theoretical_loss": 3.3148337773874896, + "tokens_seen": 3070689280 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037414456800684347, + "loss": 2.3422, + "theoretical_loss": 3.3148337773874896, + "tokens_seen": 3070689280 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003740376390076989, + "loss": 2.6615, + "theoretical_loss": 3.3148283636796383, + "tokens_seen": 3070754816 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037393071000855436, + "loss": 2.6344, + "theoretical_loss": 3.314822950119675, + "tokens_seen": 3070820352 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003738237810094098, + "loss": 2.5218, + "theoretical_loss": 3.3148175367075927, + "tokens_seen": 3070885888 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003737168520102652, + "loss": 2.4801, + "theoretical_loss": 3.3148121234433843, + "tokens_seen": 3070951424 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003736099230111206, + "loss": 2.4986, + "theoretical_loss": 3.314806710327043, + "tokens_seen": 3071016960 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003735029940119761, + "loss": 2.6203, + "theoretical_loss": 3.3148012973585606, + "tokens_seen": 3071082496 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037339606501283145, + "loss": 2.6511, + "theoretical_loss": 3.3147958845379306, + "tokens_seen": 3071148032 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003732891360136869, + "loss": 2.7088, + "theoretical_loss": 3.3147904718651455, + "tokens_seen": 3071213568 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037318220701454234, + "loss": 2.7417, + "theoretical_loss": 3.3147850593401986, + "tokens_seen": 3071279104 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037307527801539776, + "loss": 2.6584, + "theoretical_loss": 3.3147796469630824, + "tokens_seen": 3071344640 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003729683490162532, + "loss": 2.6716, + "theoretical_loss": 3.3147742347337896, + "tokens_seen": 3071410176 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037286142001710865, + "loss": 2.6926, + "theoretical_loss": 3.314768822652313, + "tokens_seen": 3071475712 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003727544910179641, + "loss": 2.6427, + "theoretical_loss": 3.3147634107186454, + "tokens_seen": 3071541248 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003726475620188195, + "loss": 2.749, + "theoretical_loss": 3.3147579989327802, + "tokens_seen": 3071606784 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037254063301967496, + "loss": 2.7656, + "theoretical_loss": 3.3147525872947092, + "tokens_seen": 3071672320 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003724337040205304, + "loss": 2.5801, + "theoretical_loss": 3.3147471758044262, + "tokens_seen": 3071737856 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003723267750213858, + "loss": 2.5855, + "theoretical_loss": 3.3147417644619233, + "tokens_seen": 3071803392 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003722198460222412, + "loss": 2.7107, + "theoretical_loss": 3.3147363532671936, + "tokens_seen": 3071868928 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003721129170230967, + "loss": 2.6251, + "theoretical_loss": 3.31473094222023, + "tokens_seen": 3071934464 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037200598802395205, + "loss": 2.399, + "theoretical_loss": 3.3147255313210247, + "tokens_seen": 3072000000 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037189905902480753, + "loss": 2.6327, + "theoretical_loss": 3.3147201205695715, + "tokens_seen": 3072065536 + }, + { + "epoch": 0.26, + "learning_rate": 0.000371792130025663, + "loss": 2.7753, + "theoretical_loss": 3.3147147099658625, + "tokens_seen": 3072131072 + }, + { + "epoch": 0.26, + "learning_rate": 0.0003716852010265184, + "loss": 2.5981, + "theoretical_loss": 3.3147092995098912, + "tokens_seen": 3072196608 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037157827202737384, + "loss": 2.5691, + "theoretical_loss": 3.3147038892016494, + "tokens_seen": 3072262144 + }, + { + "epoch": 0.26, + "objective/train/docs_used": 1683794, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.220975875854492, + "objective/train/theoretical_loss": 3.3146984790411307, + "objective/train/tokens_used": 102378976, + "theoretical_loss": 3.3146984790411307, + "tokens_seen": 3072327680 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037147134302822926, + "loss": 2.6382, + "theoretical_loss": 3.3146984790411307, + "tokens_seen": 3072327680 + }, + { + "epoch": 0.26, + "learning_rate": 0.00037136441402908473, + "loss": 2.4398, + "theoretical_loss": 3.3146930690283276, + "tokens_seen": 3072393216 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003712574850299401, + "loss": 2.5308, + "theoretical_loss": 3.314687659163233, + "tokens_seen": 3072458752 + }, + { + "epoch": 0.27, + "learning_rate": 0.00037115055603079557, + "loss": 2.6647, + "theoretical_loss": 3.3146822494458394, + "tokens_seen": 3072524288 + }, + { + "epoch": 0.27, + "learning_rate": 0.000371043627031651, + "loss": 2.609, + "theoretical_loss": 3.3146768398761406, + "tokens_seen": 3072589824 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003709366980325064, + "loss": 2.5539, + "theoretical_loss": 3.3146714304541285, + "tokens_seen": 3072655360 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003708297690333619, + "loss": 2.5673, + "theoretical_loss": 3.3146660211797956, + "tokens_seen": 3072720896 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003707228400342173, + "loss": 2.534, + "theoretical_loss": 3.314660612053136, + "tokens_seen": 3072786432 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003706159110350727, + "loss": 2.467, + "theoretical_loss": 3.3146552030741416, + "tokens_seen": 3072851968 + }, + { + "epoch": 0.27, + "learning_rate": 0.00037050898203592813, + "loss": 2.5527, + "theoretical_loss": 3.314649794242805, + "tokens_seen": 3072917504 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003704020530367836, + "loss": 2.8393, + "theoretical_loss": 3.31464438555912, + "tokens_seen": 3072983040 + }, + { + "epoch": 0.27, + "learning_rate": 0.000370295124037639, + "loss": 2.5045, + "theoretical_loss": 3.314638977023079, + "tokens_seen": 3073048576 + }, + { + "epoch": 0.27, + "learning_rate": 0.00037018819503849444, + "loss": 2.5541, + "theoretical_loss": 3.314633568634674, + "tokens_seen": 3073114112 + }, + { + "epoch": 0.27, + "learning_rate": 0.00037008126603934986, + "loss": 2.5825, + "theoretical_loss": 3.314628160393899, + "tokens_seen": 3073179648 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036997433704020533, + "loss": 2.5034, + "theoretical_loss": 3.314622752300746, + "tokens_seen": 3073245184 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036986740804106075, + "loss": 2.5611, + "theoretical_loss": 3.3146173443552085, + "tokens_seen": 3073310720 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036976047904191617, + "loss": 2.5575, + "theoretical_loss": 3.3146119365572786, + "tokens_seen": 3073376256 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036965355004277164, + "loss": 2.4762, + "theoretical_loss": 3.3146065289069497, + "tokens_seen": 3073441792 + }, + { + "epoch": 0.27, + "learning_rate": 0.000369546621043627, + "loss": 2.8237, + "theoretical_loss": 3.3146011214042144, + "tokens_seen": 3073507328 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003694396920444825, + "loss": 2.6976, + "theoretical_loss": 3.3145957140490654, + "tokens_seen": 3073572864 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003693327630453379, + "loss": 2.7649, + "theoretical_loss": 3.314590306841496, + "tokens_seen": 3073638400 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003692258340461933, + "loss": 2.5391, + "theoretical_loss": 3.3145848997814986, + "tokens_seen": 3073703936 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036911890504704874, + "loss": 2.6388, + "theoretical_loss": 3.3145794928690657, + "tokens_seen": 3073769472 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003690119760479042, + "loss": 2.6619, + "theoretical_loss": 3.314574086104191, + "tokens_seen": 3073835008 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003689050470487597, + "loss": 2.8502, + "theoretical_loss": 3.3145686794868667, + "tokens_seen": 3073900544 + }, + { + "epoch": 0.27, + "objective/train/docs_used": 1684169, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.301888942718506, + "objective/train/theoretical_loss": 3.314563273017086, + "objective/train/tokens_used": 104017376, + "theoretical_loss": 3.314563273017086, + "tokens_seen": 3073966080 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036879811804961505, + "loss": 2.4423, + "theoretical_loss": 3.314563273017086, + "tokens_seen": 3073966080 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003686911890504705, + "loss": 2.7374, + "theoretical_loss": 3.314557866694841, + "tokens_seen": 3074031616 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036858426005132594, + "loss": 2.7202, + "theoretical_loss": 3.314552460520126, + "tokens_seen": 3074097152 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036847733105218136, + "loss": 2.7584, + "theoretical_loss": 3.314547054492932, + "tokens_seen": 3074162688 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003683704020530368, + "loss": 2.6721, + "theoretical_loss": 3.314541648613253, + "tokens_seen": 3074228224 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036826347305389225, + "loss": 2.6404, + "theoretical_loss": 3.314536242881082, + "tokens_seen": 3074293760 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003681565440547476, + "loss": 2.6355, + "theoretical_loss": 3.314530837296411, + "tokens_seen": 3074359296 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003680496150556031, + "loss": 2.3759, + "theoretical_loss": 3.3145254318592325, + "tokens_seen": 3074424832 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003679426860564585, + "loss": 2.5193, + "theoretical_loss": 3.314520026569541, + "tokens_seen": 3074490368 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003678357570573139, + "loss": 2.5996, + "theoretical_loss": 3.3145146214273282, + "tokens_seen": 3074555904 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003677288280581694, + "loss": 2.5103, + "theoretical_loss": 3.3145092164325867, + "tokens_seen": 3074621440 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003676218990590248, + "loss": 2.8836, + "theoretical_loss": 3.3145038115853103, + "tokens_seen": 3074686976 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003675149700598803, + "loss": 2.682, + "theoretical_loss": 3.314498406885491, + "tokens_seen": 3074752512 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036740804106073565, + "loss": 2.7615, + "theoretical_loss": 3.314493002333122, + "tokens_seen": 3074818048 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003673011120615911, + "loss": 2.6206, + "theoretical_loss": 3.314487597928196, + "tokens_seen": 3074883584 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036719418306244654, + "loss": 2.7439, + "theoretical_loss": 3.3144821936707056, + "tokens_seen": 3074949120 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036708725406330196, + "loss": 2.7887, + "theoretical_loss": 3.3144767895606444, + "tokens_seen": 3075014656 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003669803250641574, + "loss": 2.7711, + "theoretical_loss": 3.3144713855980044, + "tokens_seen": 3075080192 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036687339606501285, + "loss": 2.5827, + "theoretical_loss": 3.314465981782779, + "tokens_seen": 3075145728 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036676646706586827, + "loss": 2.9233, + "theoretical_loss": 3.3144605781149608, + "tokens_seen": 3075211264 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003666595380667237, + "loss": 2.63, + "theoretical_loss": 3.3144551745945425, + "tokens_seen": 3075276800 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036655260906757916, + "loss": 2.6958, + "theoretical_loss": 3.314449771221517, + "tokens_seen": 3075342336 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003664456800684346, + "loss": 2.5697, + "theoretical_loss": 3.3144443679958773, + "tokens_seen": 3075407872 + }, + { + "epoch": 0.27, + "learning_rate": 0.00036633875106929, + "loss": 2.6981, + "theoretical_loss": 3.3144389649176165, + "tokens_seen": 3075473408 + }, + { + "epoch": 0.27, + "learning_rate": 0.0003662318220701454, + "loss": 2.6977, + "theoretical_loss": 3.3144335619867267, + "tokens_seen": 3075538944 + }, + { + "epoch": 0.27, + "objective/train/docs_used": 1684863, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.703925132751465, + "objective/train/theoretical_loss": 3.3144281592032017, + "objective/train/tokens_used": 105655776, + "theoretical_loss": 3.3144281592032017, + "tokens_seen": 3075604480 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003661248930710009, + "loss": 2.7531, + "theoretical_loss": 3.3144281592032017, + "tokens_seen": 3075604480 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036601796407185626, + "loss": 2.5068, + "theoretical_loss": 3.3144227565670334, + "tokens_seen": 3075670016 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036591103507271173, + "loss": 2.6512, + "theoretical_loss": 3.3144173540782154, + "tokens_seen": 3075735552 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003658041060735672, + "loss": 2.6823, + "theoretical_loss": 3.3144119517367394, + "tokens_seen": 3075801088 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036569717707442257, + "loss": 2.6634, + "theoretical_loss": 3.3144065495426, + "tokens_seen": 3075866624 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036559024807527804, + "loss": 2.6239, + "theoretical_loss": 3.314401147495788, + "tokens_seen": 3075932160 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036548331907613346, + "loss": 2.5854, + "theoretical_loss": 3.3143957455962982, + "tokens_seen": 3075997696 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003653763900769889, + "loss": 2.684, + "theoretical_loss": 3.3143903438441225, + "tokens_seen": 3076063232 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003652694610778443, + "loss": 2.7387, + "theoretical_loss": 3.3143849422392533, + "tokens_seen": 3076128768 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036516253207869977, + "loss": 2.6322, + "theoretical_loss": 3.314379540781684, + "tokens_seen": 3076194304 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003650556030795552, + "loss": 2.6662, + "theoretical_loss": 3.3143741394714077, + "tokens_seen": 3076259840 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003649486740804106, + "loss": 2.7186, + "theoretical_loss": 3.314368738308417, + "tokens_seen": 3076325376 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003648417450812661, + "loss": 2.621, + "theoretical_loss": 3.314363337292704, + "tokens_seen": 3076390912 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003647348160821215, + "loss": 2.5464, + "theoretical_loss": 3.314357936424263, + "tokens_seen": 3076456448 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003646278870829769, + "loss": 2.5827, + "theoretical_loss": 3.314352535703086, + "tokens_seen": 3076521984 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036452095808383234, + "loss": 2.8452, + "theoretical_loss": 3.3143471351291653, + "tokens_seen": 3076587520 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003644140290846878, + "loss": 2.7305, + "theoretical_loss": 3.314341734702495, + "tokens_seen": 3076653056 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036430710008554317, + "loss": 2.6206, + "theoretical_loss": 3.314336334423067, + "tokens_seen": 3076718592 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036420017108639865, + "loss": 2.4647, + "theoretical_loss": 3.3143309342908744, + "tokens_seen": 3076784128 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036409324208725406, + "loss": 2.6854, + "theoretical_loss": 3.3143255343059104, + "tokens_seen": 3076849664 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003639863130881095, + "loss": 2.6977, + "theoretical_loss": 3.314320134468167, + "tokens_seen": 3076915200 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003638793840889649, + "loss": 2.6815, + "theoretical_loss": 3.3143147347776383, + "tokens_seen": 3076980736 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003637724550898204, + "loss": 2.6089, + "theoretical_loss": 3.3143093352343165, + "tokens_seen": 3077046272 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036366552609067585, + "loss": 2.5925, + "theoretical_loss": 3.314303935838194, + "tokens_seen": 3077111808 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003635585970915312, + "loss": 2.6264, + "theoretical_loss": 3.314298536589264, + "tokens_seen": 3077177344 + }, + { + "epoch": 0.28, + "objective/train/docs_used": 1686089, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 3.2093522548675537, + "objective/train/theoretical_loss": 3.3142931374875197, + "objective/train/tokens_used": 107294176, + "theoretical_loss": 3.3142931374875197, + "tokens_seen": 3077242880 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003634516680923867, + "loss": 2.7345, + "theoretical_loss": 3.3142931374875197, + "tokens_seen": 3077242880 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003633447390932421, + "loss": 2.7784, + "theoretical_loss": 3.3142877385329537, + "tokens_seen": 3077308416 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003632378100940975, + "loss": 2.5397, + "theoretical_loss": 3.3142823397255587, + "tokens_seen": 3077373952 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036313088109495294, + "loss": 2.4738, + "theoretical_loss": 3.314276941065328, + "tokens_seen": 3077439488 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003630239520958084, + "loss": 2.3092, + "theoretical_loss": 3.3142715425522535, + "tokens_seen": 3077505024 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003629170230966638, + "loss": 2.5891, + "theoretical_loss": 3.3142661441863295, + "tokens_seen": 3077570560 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036281009409751925, + "loss": 2.7577, + "theoretical_loss": 3.3142607459675473, + "tokens_seen": 3077636096 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003627031650983747, + "loss": 2.4505, + "theoretical_loss": 3.3142553478959007, + "tokens_seen": 3077701632 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003625962360992301, + "loss": 2.5934, + "theoretical_loss": 3.3142499499713827, + "tokens_seen": 3077767168 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036248930710008556, + "loss": 2.7962, + "theoretical_loss": 3.3142445521939856, + "tokens_seen": 3077832704 + }, + { + "epoch": 0.28, + "learning_rate": 0.000362382378100941, + "loss": 2.5607, + "theoretical_loss": 3.3142391545637024, + "tokens_seen": 3077898240 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036227544910179645, + "loss": 2.4861, + "theoretical_loss": 3.314233757080526, + "tokens_seen": 3077963776 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003621685201026518, + "loss": 2.5669, + "theoretical_loss": 3.3142283597444493, + "tokens_seen": 3078029312 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003620615911035073, + "loss": 2.7184, + "theoretical_loss": 3.3142229625554656, + "tokens_seen": 3078094848 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003619546621043627, + "loss": 2.7788, + "theoretical_loss": 3.314217565513567, + "tokens_seen": 3078160384 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003618477331052181, + "loss": 2.7354, + "theoretical_loss": 3.3142121686187465, + "tokens_seen": 3078225920 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003617408041060736, + "loss": 2.4858, + "theoretical_loss": 3.3142067718709973, + "tokens_seen": 3078291456 + }, + { + "epoch": 0.28, + "learning_rate": 0.000361633875106929, + "loss": 2.6159, + "theoretical_loss": 3.3142013752703123, + "tokens_seen": 3078356992 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036152694610778444, + "loss": 2.728, + "theoretical_loss": 3.3141959788166835, + "tokens_seen": 3078422528 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036142001710863986, + "loss": 2.6238, + "theoretical_loss": 3.314190582510105, + "tokens_seen": 3078488064 + }, + { + "epoch": 0.28, + "learning_rate": 0.00036131308810949533, + "loss": 2.6783, + "theoretical_loss": 3.314185186350569, + "tokens_seen": 3078553600 + }, + { + "epoch": 0.28, + "learning_rate": 0.0003612061591103507, + "loss": 2.5088, + "theoretical_loss": 3.3141797903380685, + "tokens_seen": 3078619136 + }, + { + "epoch": 0.29, + "learning_rate": 0.00036109923011120617, + "loss": 2.7777, + "theoretical_loss": 3.3141743944725963, + "tokens_seen": 3078684672 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003609923011120616, + "loss": 2.5672, + "theoretical_loss": 3.314168998754145, + "tokens_seen": 3078750208 + }, + { + "epoch": 0.29, + "learning_rate": 0.00036088537211291706, + "loss": 2.7797, + "theoretical_loss": 3.314163603182708, + "tokens_seen": 3078815744 + }, + { + "epoch": 0.29, + "objective/train/docs_used": 1687537, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7404658794403076, + "objective/train/theoretical_loss": 3.314158207758278, + "objective/train/tokens_used": 108932576, + "theoretical_loss": 3.314158207758278, + "tokens_seen": 3078881280 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003607784431137724, + "loss": 2.638, + "theoretical_loss": 3.314158207758278, + "tokens_seen": 3078881280 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003606715141146279, + "loss": 2.6652, + "theoretical_loss": 3.3141528124808475, + "tokens_seen": 3078946816 + }, + { + "epoch": 0.29, + "learning_rate": 0.00036056458511548337, + "loss": 2.5673, + "theoretical_loss": 3.3141474173504095, + "tokens_seen": 3079012352 + }, + { + "epoch": 0.29, + "learning_rate": 0.00036045765611633873, + "loss": 2.6115, + "theoretical_loss": 3.3141420223669575, + "tokens_seen": 3079077888 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003603507271171942, + "loss": 2.4179, + "theoretical_loss": 3.3141366275304835, + "tokens_seen": 3079143424 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003602437981180496, + "loss": 2.587, + "theoretical_loss": 3.314131232840981, + "tokens_seen": 3079208960 + }, + { + "epoch": 0.29, + "learning_rate": 0.00036013686911890504, + "loss": 2.4774, + "theoretical_loss": 3.3141258382984424, + "tokens_seen": 3079274496 + }, + { + "epoch": 0.29, + "learning_rate": 0.00036002994011976046, + "loss": 2.594, + "theoretical_loss": 3.3141204439028606, + "tokens_seen": 3079340032 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035992301112061593, + "loss": 2.623, + "theoretical_loss": 3.3141150496542293, + "tokens_seen": 3079405568 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035981608212147135, + "loss": 2.7003, + "theoretical_loss": 3.31410965555254, + "tokens_seen": 3079471104 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035970915312232677, + "loss": 2.7533, + "theoretical_loss": 3.3141042615977865, + "tokens_seen": 3079536640 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035960222412318224, + "loss": 2.6504, + "theoretical_loss": 3.3140988677899617, + "tokens_seen": 3079602176 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035949529512403766, + "loss": 2.7506, + "theoretical_loss": 3.314093474129058, + "tokens_seen": 3079667712 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003593883661248931, + "loss": 2.5319, + "theoretical_loss": 3.3140880806150683, + "tokens_seen": 3079733248 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003592814371257485, + "loss": 2.5518, + "theoretical_loss": 3.314082687247986, + "tokens_seen": 3079798784 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035917450812660397, + "loss": 2.4409, + "theoretical_loss": 3.3140772940278036, + "tokens_seen": 3079864320 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035906757912745934, + "loss": 2.6684, + "theoretical_loss": 3.314071900954514, + "tokens_seen": 3079929856 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003589606501283148, + "loss": 2.4182, + "theoretical_loss": 3.31406650802811, + "tokens_seen": 3079995392 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035885372112917023, + "loss": 2.5897, + "theoretical_loss": 3.3140611152485846, + "tokens_seen": 3080060928 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035874679213002565, + "loss": 2.5569, + "theoretical_loss": 3.314055722615931, + "tokens_seen": 3080126464 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003586398631308811, + "loss": 2.6545, + "theoretical_loss": 3.314050330130141, + "tokens_seen": 3080192000 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035853293413173654, + "loss": 2.5328, + "theoretical_loss": 3.3140449377912087, + "tokens_seen": 3080257536 + }, + { + "epoch": 0.29, + "learning_rate": 0.000358426005132592, + "loss": 2.6712, + "theoretical_loss": 3.314039545599126, + "tokens_seen": 3080323072 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003583190761334474, + "loss": 2.5482, + "theoretical_loss": 3.314034153553887, + "tokens_seen": 3080388608 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035821214713430285, + "loss": 2.3456, + "theoretical_loss": 3.314028761655483, + "tokens_seen": 3080454144 + }, + { + "epoch": 0.29, + "objective/train/docs_used": 1688234, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5623414516448975, + "objective/train/theoretical_loss": 3.314023369903908, + "objective/train/tokens_used": 110570976, + "theoretical_loss": 3.314023369903908, + "tokens_seen": 3080519680 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035810521813515827, + "loss": 2.4662, + "theoretical_loss": 3.314023369903908, + "tokens_seen": 3080519680 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003579982891360137, + "loss": 2.6175, + "theoretical_loss": 3.3140179782991552, + "tokens_seen": 3080585216 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003578913601368691, + "loss": 2.6005, + "theoretical_loss": 3.314012586841216, + "tokens_seen": 3080650752 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003577844311377246, + "loss": 2.6433, + "theoretical_loss": 3.3140071955300847, + "tokens_seen": 3080716288 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035767750213858, + "loss": 2.691, + "theoretical_loss": 3.3140018043657533, + "tokens_seen": 3080781824 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003575705731394354, + "loss": 2.5925, + "theoretical_loss": 3.3139964133482147, + "tokens_seen": 3080847360 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003574636441402909, + "loss": 2.5542, + "theoretical_loss": 3.3139910224774627, + "tokens_seen": 3080912896 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035735671514114625, + "loss": 2.8091, + "theoretical_loss": 3.3139856317534893, + "tokens_seen": 3080978432 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003572497861420017, + "loss": 2.5649, + "theoretical_loss": 3.3139802411762873, + "tokens_seen": 3081043968 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035714285714285714, + "loss": 2.6339, + "theoretical_loss": 3.31397485074585, + "tokens_seen": 3081109504 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003570359281437126, + "loss": 2.5993, + "theoretical_loss": 3.313969460462171, + "tokens_seen": 3081175040 + }, + { + "epoch": 0.29, + "learning_rate": 0.000356928999144568, + "loss": 2.6593, + "theoretical_loss": 3.3139640703252415, + "tokens_seen": 3081240576 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035682207014542345, + "loss": 2.4991, + "theoretical_loss": 3.3139586803350554, + "tokens_seen": 3081306112 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003567151411462789, + "loss": 2.5606, + "theoretical_loss": 3.3139532904916056, + "tokens_seen": 3081371648 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003566082121471343, + "loss": 2.4645, + "theoretical_loss": 3.3139479007948847, + "tokens_seen": 3081437184 + }, + { + "epoch": 0.29, + "learning_rate": 0.00035650128314798976, + "loss": 2.8101, + "theoretical_loss": 3.3139425112448864, + "tokens_seen": 3081502720 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003563943541488452, + "loss": 2.6194, + "theoretical_loss": 3.313937121841602, + "tokens_seen": 3081568256 + }, + { + "epoch": 0.29, + "learning_rate": 0.0003562874251497006, + "loss": 2.6975, + "theoretical_loss": 3.3139317325850257, + "tokens_seen": 3081633792 + }, + { + "epoch": 0.29, + "learning_rate": 0.000356180496150556, + "loss": 2.5499, + "theoretical_loss": 3.31392634347515, + "tokens_seen": 3081699328 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003560735671514115, + "loss": 2.7491, + "theoretical_loss": 3.3139209545119677, + "tokens_seen": 3081764864 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035596663815226686, + "loss": 2.8247, + "theoretical_loss": 3.3139155656954715, + "tokens_seen": 3081830400 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035585970915312233, + "loss": 2.4024, + "theoretical_loss": 3.313910177025655, + "tokens_seen": 3081895936 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035575278015397775, + "loss": 2.484, + "theoretical_loss": 3.3139047885025104, + "tokens_seen": 3081961472 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003556458511548332, + "loss": 2.7873, + "theoretical_loss": 3.3138994001260307, + "tokens_seen": 3082027008 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035553892215568864, + "loss": 2.5728, + "theoretical_loss": 3.313894011896209, + "tokens_seen": 3082092544 + }, + { + "epoch": 0.3, + "objective/train/docs_used": 1689141, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2617411613464355, + "objective/train/theoretical_loss": 3.313888623813038, + "objective/train/tokens_used": 112209376, + "theoretical_loss": 3.313888623813038, + "tokens_seen": 3082158080 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035543199315654406, + "loss": 2.6148, + "theoretical_loss": 3.313888623813038, + "tokens_seen": 3082158080 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035532506415739953, + "loss": 2.4797, + "theoretical_loss": 3.313883235876511, + "tokens_seen": 3082223616 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003552181351582549, + "loss": 2.5719, + "theoretical_loss": 3.3138778480866202, + "tokens_seen": 3082289152 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035511120615911037, + "loss": 2.6098, + "theoretical_loss": 3.3138724604433594, + "tokens_seen": 3082354688 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003550042771599658, + "loss": 2.5706, + "theoretical_loss": 3.3138670729467203, + "tokens_seen": 3082420224 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003548973481608212, + "loss": 2.5948, + "theoretical_loss": 3.313861685596697, + "tokens_seen": 3082485760 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003547904191616766, + "loss": 2.6296, + "theoretical_loss": 3.3138562983932816, + "tokens_seen": 3082551296 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003546834901625321, + "loss": 2.6311, + "theoretical_loss": 3.313850911336467, + "tokens_seen": 3082616832 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035457656116338757, + "loss": 2.4267, + "theoretical_loss": 3.313845524426247, + "tokens_seen": 3082682368 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035446963216424293, + "loss": 2.4788, + "theoretical_loss": 3.313840137662613, + "tokens_seen": 3082747904 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003543627031650984, + "loss": 2.7571, + "theoretical_loss": 3.3138347510455595, + "tokens_seen": 3082813440 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003542557741659538, + "loss": 2.7525, + "theoretical_loss": 3.3138293645750783, + "tokens_seen": 3082878976 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035414884516680924, + "loss": 2.4583, + "theoretical_loss": 3.3138239782511625, + "tokens_seen": 3082944512 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035404191616766466, + "loss": 2.522, + "theoretical_loss": 3.3138185920738055, + "tokens_seen": 3083010048 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035393498716852014, + "loss": 2.5995, + "theoretical_loss": 3.3138132060429992, + "tokens_seen": 3083075584 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003538280581693755, + "loss": 2.7054, + "theoretical_loss": 3.3138078201587375, + "tokens_seen": 3083141120 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035372112917023097, + "loss": 2.7492, + "theoretical_loss": 3.313802434421013, + "tokens_seen": 3083206656 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035361420017108645, + "loss": 2.8559, + "theoretical_loss": 3.313797048829818, + "tokens_seen": 3083272192 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003535072711719418, + "loss": 2.4741, + "theoretical_loss": 3.313791663385146, + "tokens_seen": 3083337728 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003534003421727973, + "loss": 2.5676, + "theoretical_loss": 3.3137862780869902, + "tokens_seen": 3083403264 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003532934131736527, + "loss": 2.5889, + "theoretical_loss": 3.313780892935343, + "tokens_seen": 3083468800 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003531864841745082, + "loss": 2.6087, + "theoretical_loss": 3.313775507930197, + "tokens_seen": 3083534336 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035307955517536354, + "loss": 2.5886, + "theoretical_loss": 3.313770123071546, + "tokens_seen": 3083599872 + }, + { + "epoch": 0.3, + "learning_rate": 0.000352972626176219, + "loss": 2.8981, + "theoretical_loss": 3.3137647383593825, + "tokens_seen": 3083665408 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035286569717707443, + "loss": 2.6057, + "theoretical_loss": 3.313759353793699, + "tokens_seen": 3083730944 + }, + { + "epoch": 0.3, + "objective/train/docs_used": 1689663, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6877639293670654, + "objective/train/theoretical_loss": 3.313753969374489, + "objective/train/tokens_used": 113847776, + "theoretical_loss": 3.313753969374489, + "tokens_seen": 3083796480 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035275876817792985, + "loss": 2.6784, + "theoretical_loss": 3.313753969374489, + "tokens_seen": 3083796480 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035265183917878527, + "loss": 2.7501, + "theoretical_loss": 3.3137485851017447, + "tokens_seen": 3083862016 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035254491017964074, + "loss": 2.5037, + "theoretical_loss": 3.3137432009754595, + "tokens_seen": 3083927552 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035243798118049616, + "loss": 2.6851, + "theoretical_loss": 3.3137378169956264, + "tokens_seen": 3083993088 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003523310521813516, + "loss": 2.5302, + "theoretical_loss": 3.313732433162238, + "tokens_seen": 3084058624 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035222412318220705, + "loss": 2.4584, + "theoretical_loss": 3.313727049475287, + "tokens_seen": 3084124160 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003521171941830624, + "loss": 2.7286, + "theoretical_loss": 3.313721665934767, + "tokens_seen": 3084189696 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003520102651839179, + "loss": 2.5227, + "theoretical_loss": 3.3137162825406707, + "tokens_seen": 3084255232 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003519033361847733, + "loss": 2.6379, + "theoretical_loss": 3.3137108992929907, + "tokens_seen": 3084320768 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003517964071856288, + "loss": 2.503, + "theoretical_loss": 3.31370551619172, + "tokens_seen": 3084386304 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035168947818648414, + "loss": 2.6552, + "theoretical_loss": 3.3137001332368516, + "tokens_seen": 3084451840 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003515825491873396, + "loss": 2.829, + "theoretical_loss": 3.313694750428378, + "tokens_seen": 3084517376 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003514756201881951, + "loss": 2.6837, + "theoretical_loss": 3.313689367766293, + "tokens_seen": 3084582912 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035136869118905045, + "loss": 2.5714, + "theoretical_loss": 3.313683985250589, + "tokens_seen": 3084648448 + }, + { + "epoch": 0.3, + "learning_rate": 0.0003512617621899059, + "loss": 2.7012, + "theoretical_loss": 3.3136786028812586, + "tokens_seen": 3084713984 + }, + { + "epoch": 0.3, + "learning_rate": 0.00035115483319076135, + "loss": 2.7778, + "theoretical_loss": 3.313673220658295, + "tokens_seen": 3084779520 + }, + { + "epoch": 0.31, + "learning_rate": 0.00035104790419161676, + "loss": 2.6725, + "theoretical_loss": 3.3136678385816913, + "tokens_seen": 3084845056 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003509409751924722, + "loss": 2.5281, + "theoretical_loss": 3.31366245665144, + "tokens_seen": 3084910592 + }, + { + "epoch": 0.31, + "learning_rate": 0.00035083404619332766, + "loss": 2.601, + "theoretical_loss": 3.3136570748675345, + "tokens_seen": 3084976128 + }, + { + "epoch": 0.31, + "learning_rate": 0.000350727117194183, + "loss": 2.6085, + "theoretical_loss": 3.3136516932299673, + "tokens_seen": 3085041664 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003506201881950385, + "loss": 2.6271, + "theoretical_loss": 3.313646311738731, + "tokens_seen": 3085107200 + }, + { + "epoch": 0.31, + "learning_rate": 0.00035051325919589397, + "loss": 2.6376, + "theoretical_loss": 3.3136409303938197, + "tokens_seen": 3085172736 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003504063301967494, + "loss": 2.7566, + "theoretical_loss": 3.3136355491952254, + "tokens_seen": 3085238272 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003502994011976048, + "loss": 2.4515, + "theoretical_loss": 3.313630168142941, + "tokens_seen": 3085303808 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003501924721984602, + "loss": 2.6709, + "theoretical_loss": 3.31362478723696, + "tokens_seen": 3085369344 + }, + { + "epoch": 0.31, + "objective/train/docs_used": 1690915, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5282833576202393, + "objective/train/theoretical_loss": 3.3136194064772746, + "objective/train/tokens_used": 115486176, + "theoretical_loss": 3.3136194064772746, + "tokens_seen": 3085434880 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003500855431993157, + "loss": 2.5315, + "theoretical_loss": 3.3136194064772746, + "tokens_seen": 3085434880 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034997861420017106, + "loss": 2.7553, + "theoretical_loss": 3.313614025863878, + "tokens_seen": 3085500416 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034987168520102653, + "loss": 2.563, + "theoretical_loss": 3.3136086453967635, + "tokens_seen": 3085565952 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034976475620188195, + "loss": 2.7593, + "theoretical_loss": 3.3136032650759235, + "tokens_seen": 3085631488 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034965782720273737, + "loss": 2.7253, + "theoretical_loss": 3.313597884901351, + "tokens_seen": 3085697024 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034955089820359284, + "loss": 2.5465, + "theoretical_loss": 3.313592504873039, + "tokens_seen": 3085762560 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034944396920444826, + "loss": 2.6819, + "theoretical_loss": 3.3135871249909803, + "tokens_seen": 3085828096 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034933704020530373, + "loss": 2.7642, + "theoretical_loss": 3.3135817452551684, + "tokens_seen": 3085893632 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003492301112061591, + "loss": 2.4404, + "theoretical_loss": 3.3135763656655954, + "tokens_seen": 3085959168 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034912318220701457, + "loss": 2.6891, + "theoretical_loss": 3.3135709862222544, + "tokens_seen": 3086024704 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034901625320787, + "loss": 2.4828, + "theoretical_loss": 3.313565606925139, + "tokens_seen": 3086090240 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003489093242087254, + "loss": 2.685, + "theoretical_loss": 3.3135602277742415, + "tokens_seen": 3086155776 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003488023952095808, + "loss": 2.6188, + "theoretical_loss": 3.313554848769555, + "tokens_seen": 3086221312 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003486954662104363, + "loss": 2.659, + "theoretical_loss": 3.313549469911072, + "tokens_seen": 3086286848 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034858853721129166, + "loss": 2.4088, + "theoretical_loss": 3.313544091198786, + "tokens_seen": 3086352384 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034848160821214714, + "loss": 2.5368, + "theoretical_loss": 3.3135387126326896, + "tokens_seen": 3086417920 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003483746792130026, + "loss": 2.668, + "theoretical_loss": 3.313533334212776, + "tokens_seen": 3086483456 + }, + { + "epoch": 0.31, + "learning_rate": 0.000348267750213858, + "loss": 2.526, + "theoretical_loss": 3.313527955939038, + "tokens_seen": 3086548992 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034816082121471345, + "loss": 2.5678, + "theoretical_loss": 3.3135225778114683, + "tokens_seen": 3086614528 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034805389221556886, + "loss": 2.5425, + "theoretical_loss": 3.31351719983006, + "tokens_seen": 3086680064 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034794696321642434, + "loss": 2.4835, + "theoretical_loss": 3.313511821994806, + "tokens_seen": 3086745600 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003478400342172797, + "loss": 2.6388, + "theoretical_loss": 3.3135064443056996, + "tokens_seen": 3086811136 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003477331052181352, + "loss": 2.6737, + "theoretical_loss": 3.313501066762733, + "tokens_seen": 3086876672 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003476261762189906, + "loss": 2.7425, + "theoretical_loss": 3.3134956893658996, + "tokens_seen": 3086942208 + }, + { + "epoch": 0.31, + "learning_rate": 0.000347519247219846, + "loss": 2.5201, + "theoretical_loss": 3.3134903121151926, + "tokens_seen": 3087007744 + }, + { + "epoch": 0.31, + "objective/train/docs_used": 1692034, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6510801315307617, + "objective/train/theoretical_loss": 3.313484935010604, + "objective/train/tokens_used": 117124576, + "theoretical_loss": 3.313484935010604, + "tokens_seen": 3087073280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003474123182207015, + "loss": 2.6653, + "theoretical_loss": 3.313484935010604, + "tokens_seen": 3087073280 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003473053892215569, + "loss": 2.8366, + "theoretical_loss": 3.3134795580521277, + "tokens_seen": 3087138816 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003471984602224123, + "loss": 2.563, + "theoretical_loss": 3.313474181239756, + "tokens_seen": 3087204352 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034709153122326774, + "loss": 2.6236, + "theoretical_loss": 3.313468804573482, + "tokens_seen": 3087269888 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003469846022241232, + "loss": 2.7436, + "theoretical_loss": 3.3134634280532986, + "tokens_seen": 3087335424 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003468776732249786, + "loss": 2.5464, + "theoretical_loss": 3.313458051679199, + "tokens_seen": 3087400960 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034677074422583405, + "loss": 2.7689, + "theoretical_loss": 3.3134526754511757, + "tokens_seen": 3087466496 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034666381522668947, + "loss": 2.7398, + "theoretical_loss": 3.3134472993692223, + "tokens_seen": 3087532032 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034655688622754494, + "loss": 2.4903, + "theoretical_loss": 3.3134419234333308, + "tokens_seen": 3087597568 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034644995722840036, + "loss": 2.5234, + "theoretical_loss": 3.3134365476434953, + "tokens_seen": 3087663104 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003463430282292558, + "loss": 2.6597, + "theoretical_loss": 3.3134311719997074, + "tokens_seen": 3087728640 + }, + { + "epoch": 0.31, + "learning_rate": 0.00034623609923011125, + "loss": 2.5869, + "theoretical_loss": 3.313425796501961, + "tokens_seen": 3087794176 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003461291702309666, + "loss": 2.5968, + "theoretical_loss": 3.3134204211502487, + "tokens_seen": 3087859712 + }, + { + "epoch": 0.31, + "learning_rate": 0.0003460222412318221, + "loss": 2.6379, + "theoretical_loss": 3.3134150459445633, + "tokens_seen": 3087925248 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003459153122326775, + "loss": 2.6229, + "theoretical_loss": 3.313409670884898, + "tokens_seen": 3087990784 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034580838323353293, + "loss": 2.4611, + "theoretical_loss": 3.313404295971245, + "tokens_seen": 3088056320 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034570145423438835, + "loss": 2.7662, + "theoretical_loss": 3.313398921203599, + "tokens_seen": 3088121856 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003455945252352438, + "loss": 2.6835, + "theoretical_loss": 3.313393546581951, + "tokens_seen": 3088187392 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034548759623609924, + "loss": 2.4955, + "theoretical_loss": 3.313388172106295, + "tokens_seen": 3088252928 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034538066723695466, + "loss": 2.783, + "theoretical_loss": 3.3133827977766237, + "tokens_seen": 3088318464 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034527373823781013, + "loss": 2.7531, + "theoretical_loss": 3.3133774235929296, + "tokens_seen": 3088384000 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034516680923866555, + "loss": 2.6813, + "theoretical_loss": 3.3133720495552064, + "tokens_seen": 3088449536 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034505988023952097, + "loss": 2.52, + "theoretical_loss": 3.313366675663447, + "tokens_seen": 3088515072 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003449529512403764, + "loss": 2.547, + "theoretical_loss": 3.3133613019176433, + "tokens_seen": 3088580608 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034484602224123186, + "loss": 2.4318, + "theoretical_loss": 3.3133559283177894, + "tokens_seen": 3088646144 + }, + { + "epoch": 0.32, + "objective/train/docs_used": 1692731, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.209240436553955, + "objective/train/theoretical_loss": 3.3133505548638778, + "objective/train/tokens_used": 118762976, + "theoretical_loss": 3.3133505548638778, + "tokens_seen": 3088711680 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003447390932420872, + "loss": 2.5568, + "theoretical_loss": 3.3133505548638778, + "tokens_seen": 3088711680 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003446321642429427, + "loss": 2.6515, + "theoretical_loss": 3.3133451815559014, + "tokens_seen": 3088777216 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034452523524379817, + "loss": 2.9243, + "theoretical_loss": 3.3133398083938532, + "tokens_seen": 3088842752 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034441830624465353, + "loss": 2.7112, + "theoretical_loss": 3.313334435377726, + "tokens_seen": 3088908288 + }, + { + "epoch": 0.32, + "learning_rate": 0.000344311377245509, + "loss": 2.5896, + "theoretical_loss": 3.313329062507513, + "tokens_seen": 3088973824 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003442044482463644, + "loss": 2.6759, + "theoretical_loss": 3.3133236897832066, + "tokens_seen": 3089039360 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034409751924721984, + "loss": 2.6537, + "theoretical_loss": 3.313318317204801, + "tokens_seen": 3089104896 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034399059024807526, + "loss": 2.6525, + "theoretical_loss": 3.3133129447722873, + "tokens_seen": 3089170432 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034388366124893073, + "loss": 2.6344, + "theoretical_loss": 3.3133075724856598, + "tokens_seen": 3089235968 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034377673224978615, + "loss": 2.5654, + "theoretical_loss": 3.313302200344911, + "tokens_seen": 3089301504 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034366980325064157, + "loss": 2.558, + "theoretical_loss": 3.313296828350034, + "tokens_seen": 3089367040 + }, + { + "epoch": 0.32, + "learning_rate": 0.000343562874251497, + "loss": 2.4973, + "theoretical_loss": 3.313291456501022, + "tokens_seen": 3089432576 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034345594525235246, + "loss": 2.6811, + "theoretical_loss": 3.313286084797867, + "tokens_seen": 3089498112 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003433490162532079, + "loss": 2.7215, + "theoretical_loss": 3.3132807132405633, + "tokens_seen": 3089563648 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003432420872540633, + "loss": 2.6857, + "theoretical_loss": 3.3132753418291023, + "tokens_seen": 3089629184 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034313515825491877, + "loss": 2.5835, + "theoretical_loss": 3.3132699705634785, + "tokens_seen": 3089694720 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034302822925577414, + "loss": 2.8779, + "theoretical_loss": 3.3132645994436833, + "tokens_seen": 3089760256 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003429213002566296, + "loss": 2.644, + "theoretical_loss": 3.313259228469711, + "tokens_seen": 3089825792 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034281437125748503, + "loss": 2.7065, + "theoretical_loss": 3.313253857641554, + "tokens_seen": 3089891328 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003427074422583405, + "loss": 2.6218, + "theoretical_loss": 3.3132484869592047, + "tokens_seen": 3089956864 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034260051325919587, + "loss": 2.8176, + "theoretical_loss": 3.3132431164226572, + "tokens_seen": 3090022400 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034249358426005134, + "loss": 2.7892, + "theoretical_loss": 3.3132377460319034, + "tokens_seen": 3090087936 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003423866552609068, + "loss": 2.6782, + "theoretical_loss": 3.313232375786937, + "tokens_seen": 3090153472 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003422797262617622, + "loss": 2.6079, + "theoretical_loss": 3.31322700568775, + "tokens_seen": 3090219008 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034217279726261765, + "loss": 2.6597, + "theoretical_loss": 3.3132216357343367, + "tokens_seen": 3090284544 + }, + { + "epoch": 0.32, + "objective/train/docs_used": 1693616, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.7858035564422607, + "objective/train/theoretical_loss": 3.3132162659266893, + "objective/train/tokens_used": 120401376, + "theoretical_loss": 3.3132162659266893, + "tokens_seen": 3090350080 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034206586826347307, + "loss": 2.715, + "theoretical_loss": 3.3132162659266893, + "tokens_seen": 3090350080 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003419589392643285, + "loss": 2.7105, + "theoretical_loss": 3.313210896264801, + "tokens_seen": 3090415616 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003418520102651839, + "loss": 2.7826, + "theoretical_loss": 3.3132055267486638, + "tokens_seen": 3090481152 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003417450812660394, + "loss": 2.5884, + "theoretical_loss": 3.3132001573782714, + "tokens_seen": 3090546688 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034163815226689474, + "loss": 2.5685, + "theoretical_loss": 3.3131947881536172, + "tokens_seen": 3090612224 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003415312232677502, + "loss": 2.764, + "theoretical_loss": 3.3131894190746936, + "tokens_seen": 3090677760 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003414242942686057, + "loss": 2.5703, + "theoretical_loss": 3.3131840501414938, + "tokens_seen": 3090743296 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003413173652694611, + "loss": 2.7023, + "theoretical_loss": 3.3131786813540107, + "tokens_seen": 3090808832 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003412104362703165, + "loss": 2.6606, + "theoretical_loss": 3.313173312712237, + "tokens_seen": 3090874368 + }, + { + "epoch": 0.32, + "learning_rate": 0.00034110350727117194, + "loss": 2.6269, + "theoretical_loss": 3.3131679442161657, + "tokens_seen": 3090939904 + }, + { + "epoch": 0.32, + "learning_rate": 0.0003409965782720274, + "loss": 2.7679, + "theoretical_loss": 3.31316257586579, + "tokens_seen": 3091005440 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003408896492728828, + "loss": 2.709, + "theoretical_loss": 3.3131572076611024, + "tokens_seen": 3091070976 + }, + { + "epoch": 0.33, + "learning_rate": 0.00034078272027373825, + "loss": 2.808, + "theoretical_loss": 3.3131518396020967, + "tokens_seen": 3091136512 + }, + { + "epoch": 0.33, + "learning_rate": 0.00034067579127459367, + "loss": 2.4574, + "theoretical_loss": 3.313146471688765, + "tokens_seen": 3091202048 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003405688622754491, + "loss": 2.8083, + "theoretical_loss": 3.313141103921101, + "tokens_seen": 3091267584 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003404619332763045, + "loss": 2.4988, + "theoretical_loss": 3.3131357362990967, + "tokens_seen": 3091333120 + }, + { + "epoch": 0.33, + "learning_rate": 0.00034035500427716, + "loss": 2.8995, + "theoretical_loss": 3.3131303688227463, + "tokens_seen": 3091398656 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003402480752780154, + "loss": 2.8102, + "theoretical_loss": 3.3131250014920415, + "tokens_seen": 3091464192 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003401411462788708, + "loss": 2.557, + "theoretical_loss": 3.313119634306976, + "tokens_seen": 3091529728 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003400342172797263, + "loss": 2.6261, + "theoretical_loss": 3.313114267267543, + "tokens_seen": 3091595264 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003399272882805817, + "loss": 2.8272, + "theoretical_loss": 3.313108900373735, + "tokens_seen": 3091660800 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033982035928143713, + "loss": 2.6256, + "theoretical_loss": 3.3131035336255446, + "tokens_seen": 3091726336 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033971343028229255, + "loss": 2.8236, + "theoretical_loss": 3.3130981670229653, + "tokens_seen": 3091791872 + }, + { + "epoch": 0.33, + "learning_rate": 0.000339606501283148, + "loss": 2.5894, + "theoretical_loss": 3.31309280056599, + "tokens_seen": 3091857408 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003394995722840034, + "loss": 2.6385, + "theoretical_loss": 3.3130874342546117, + "tokens_seen": 3091922944 + }, + { + "epoch": 0.33, + "objective/train/docs_used": 1694218, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4601612091064453, + "objective/train/theoretical_loss": 3.3130820680888236, + "objective/train/tokens_used": 122039776, + "theoretical_loss": 3.3130820680888236, + "tokens_seen": 3091988480 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033939264328485886, + "loss": 2.6435, + "theoretical_loss": 3.3130820680888236, + "tokens_seen": 3091988480 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033928571428571433, + "loss": 2.7692, + "theoretical_loss": 3.313076702068618, + "tokens_seen": 3092054016 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003391787852865697, + "loss": 2.5241, + "theoretical_loss": 3.313071336193988, + "tokens_seen": 3092119552 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033907185628742517, + "loss": 2.7312, + "theoretical_loss": 3.313065970464927, + "tokens_seen": 3092185088 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003389649272882806, + "loss": 2.5237, + "theoretical_loss": 3.313060604881428, + "tokens_seen": 3092250624 + }, + { + "epoch": 0.33, + "learning_rate": 0.000338857998289136, + "loss": 2.6796, + "theoretical_loss": 3.3130552394434836, + "tokens_seen": 3092316160 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003387510692899914, + "loss": 2.459, + "theoretical_loss": 3.313049874151087, + "tokens_seen": 3092381696 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003386441402908469, + "loss": 2.5755, + "theoretical_loss": 3.313044509004231, + "tokens_seen": 3092447232 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003385372112917023, + "loss": 2.7496, + "theoretical_loss": 3.313039144002908, + "tokens_seen": 3092512768 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033843028229255773, + "loss": 2.8441, + "theoretical_loss": 3.313033779147112, + "tokens_seen": 3092578304 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003383233532934132, + "loss": 2.5484, + "theoretical_loss": 3.313028414436836, + "tokens_seen": 3092643840 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003382164242942686, + "loss": 2.8492, + "theoretical_loss": 3.313023049872072, + "tokens_seen": 3092709376 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033810949529512404, + "loss": 2.7047, + "theoretical_loss": 3.313017685452814, + "tokens_seen": 3092774912 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033800256629597946, + "loss": 2.5878, + "theoretical_loss": 3.3130123211790536, + "tokens_seen": 3092840448 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033789563729683494, + "loss": 2.8378, + "theoretical_loss": 3.3130069570507854, + "tokens_seen": 3092905984 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003377887082976903, + "loss": 2.7452, + "theoretical_loss": 3.3130015930680012, + "tokens_seen": 3092971520 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003376817792985458, + "loss": 2.5919, + "theoretical_loss": 3.312996229230695, + "tokens_seen": 3093037056 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003375748502994012, + "loss": 2.6929, + "theoretical_loss": 3.3129908655388585, + "tokens_seen": 3093102592 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033746792130025667, + "loss": 2.656, + "theoretical_loss": 3.312985501992485, + "tokens_seen": 3093168128 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003373609923011121, + "loss": 2.6176, + "theoretical_loss": 3.3129801385915685, + "tokens_seen": 3093233664 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003372540633019675, + "loss": 2.6681, + "theoretical_loss": 3.312974775336101, + "tokens_seen": 3093299200 + }, + { + "epoch": 0.33, + "learning_rate": 0.000337147134302823, + "loss": 2.6444, + "theoretical_loss": 3.3129694122260758, + "tokens_seen": 3093364736 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033704020530367834, + "loss": 2.6848, + "theoretical_loss": 3.312964049261486, + "tokens_seen": 3093430272 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003369332763045338, + "loss": 2.8115, + "theoretical_loss": 3.3129586864423244, + "tokens_seen": 3093495808 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033682634730538923, + "loss": 2.5661, + "theoretical_loss": 3.3129533237685838, + "tokens_seen": 3093561344 + }, + { + "epoch": 0.33, + "objective/train/docs_used": 1695039, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6366329193115234, + "objective/train/theoretical_loss": 3.312947961240257, + "objective/train/tokens_used": 123678176, + "theoretical_loss": 3.312947961240257, + "tokens_seen": 3093626880 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033671941830624465, + "loss": 2.558, + "theoretical_loss": 3.312947961240257, + "tokens_seen": 3093626880 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033661248930710007, + "loss": 2.8355, + "theoretical_loss": 3.3129425988573376, + "tokens_seen": 3093692416 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033650556030795554, + "loss": 2.6669, + "theoretical_loss": 3.3129372366198186, + "tokens_seen": 3093757952 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003363986313088109, + "loss": 2.7581, + "theoretical_loss": 3.3129318745276923, + "tokens_seen": 3093823488 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003362917023096664, + "loss": 2.7045, + "theoretical_loss": 3.3129265125809524, + "tokens_seen": 3093889024 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033618477331052185, + "loss": 2.482, + "theoretical_loss": 3.3129211507795913, + "tokens_seen": 3093954560 + }, + { + "epoch": 0.33, + "learning_rate": 0.00033607784431137727, + "loss": 2.6268, + "theoretical_loss": 3.3129157891236023, + "tokens_seen": 3094020096 + }, + { + "epoch": 0.33, + "learning_rate": 0.0003359709153122327, + "loss": 2.5398, + "theoretical_loss": 3.312910427612978, + "tokens_seen": 3094085632 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003358639863130881, + "loss": 2.5242, + "theoretical_loss": 3.312905066247712, + "tokens_seen": 3094151168 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003357570573139436, + "loss": 2.6828, + "theoretical_loss": 3.312899705027797, + "tokens_seen": 3094216704 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033565012831479894, + "loss": 2.6339, + "theoretical_loss": 3.312894343953226, + "tokens_seen": 3094282240 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003355431993156544, + "loss": 2.5882, + "theoretical_loss": 3.312888983023991, + "tokens_seen": 3094347776 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033543627031650984, + "loss": 2.7092, + "theoretical_loss": 3.312883622240087, + "tokens_seen": 3094413312 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033532934131736525, + "loss": 2.6759, + "theoretical_loss": 3.3128782616015053, + "tokens_seen": 3094478848 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033522241231822073, + "loss": 2.6668, + "theoretical_loss": 3.3128729011082396, + "tokens_seen": 3094544384 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033511548331907615, + "loss": 2.5897, + "theoretical_loss": 3.312867540760283, + "tokens_seen": 3094609920 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033500855431993156, + "loss": 2.6341, + "theoretical_loss": 3.312862180557628, + "tokens_seen": 3094675456 + }, + { + "epoch": 0.34, + "learning_rate": 0.000334901625320787, + "loss": 2.5726, + "theoretical_loss": 3.3128568205002678, + "tokens_seen": 3094740992 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033479469632164246, + "loss": 2.8136, + "theoretical_loss": 3.3128514605881954, + "tokens_seen": 3094806528 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003346877673224979, + "loss": 2.7594, + "theoretical_loss": 3.3128461008214036, + "tokens_seen": 3094872064 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003345808383233533, + "loss": 2.7734, + "theoretical_loss": 3.312840741199886, + "tokens_seen": 3094937600 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003344739093242087, + "loss": 2.6427, + "theoretical_loss": 3.3128353817236347, + "tokens_seen": 3095003136 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003343669803250642, + "loss": 2.5843, + "theoretical_loss": 3.3128300223926437, + "tokens_seen": 3095068672 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003342600513259196, + "loss": 2.8907, + "theoretical_loss": 3.312824663206905, + "tokens_seen": 3095134208 + }, + { + "epoch": 0.34, + "learning_rate": 0.000334153122326775, + "loss": 2.4415, + "theoretical_loss": 3.3128193041664122, + "tokens_seen": 3095199744 + }, + { + "epoch": 0.34, + "objective/train/docs_used": 1695481, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.2710697650909424, + "objective/train/theoretical_loss": 3.312813945271158, + "objective/train/tokens_used": 125316576, + "theoretical_loss": 3.312813945271158, + "tokens_seen": 3095265280 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003340461933276305, + "loss": 2.4961, + "theoretical_loss": 3.312813945271158, + "tokens_seen": 3095265280 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033393926432848586, + "loss": 2.578, + "theoretical_loss": 3.312808586521135, + "tokens_seen": 3095330816 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033383233532934133, + "loss": 2.5332, + "theoretical_loss": 3.3128032279163375, + "tokens_seen": 3095396352 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033372540633019675, + "loss": 2.7109, + "theoretical_loss": 3.3127978694567575, + "tokens_seen": 3095461888 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033361847733105217, + "loss": 2.732, + "theoretical_loss": 3.312792511142388, + "tokens_seen": 3095527424 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003335115483319076, + "loss": 2.7667, + "theoretical_loss": 3.3127871529732222, + "tokens_seen": 3095592960 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033340461933276306, + "loss": 2.6638, + "theoretical_loss": 3.3127817949492533, + "tokens_seen": 3095658496 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033329769033361853, + "loss": 2.5952, + "theoretical_loss": 3.3127764370704735, + "tokens_seen": 3095724032 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003331907613344739, + "loss": 2.7015, + "theoretical_loss": 3.3127710793368768, + "tokens_seen": 3095789568 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033308383233532937, + "loss": 2.8061, + "theoretical_loss": 3.3127657217484554, + "tokens_seen": 3095855104 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003329769033361848, + "loss": 2.8986, + "theoretical_loss": 3.312760364305203, + "tokens_seen": 3095920640 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003328699743370402, + "loss": 2.5355, + "theoretical_loss": 3.312755007007112, + "tokens_seen": 3095986176 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033276304533789563, + "loss": 2.6996, + "theoretical_loss": 3.3127496498541755, + "tokens_seen": 3096051712 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003326561163387511, + "loss": 2.4998, + "theoretical_loss": 3.3127442928463866, + "tokens_seen": 3096117248 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033254918733960646, + "loss": 2.7458, + "theoretical_loss": 3.3127389359837385, + "tokens_seen": 3096182784 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033244225834046194, + "loss": 2.6944, + "theoretical_loss": 3.312733579266224, + "tokens_seen": 3096248320 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033233532934131736, + "loss": 2.5846, + "theoretical_loss": 3.3127282226938357, + "tokens_seen": 3096313856 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033222840034217283, + "loss": 2.6838, + "theoretical_loss": 3.3127228662665673, + "tokens_seen": 3096379392 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033212147134302825, + "loss": 2.7159, + "theoretical_loss": 3.3127175099844113, + "tokens_seen": 3096444928 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033201454234388367, + "loss": 2.6473, + "theoretical_loss": 3.312712153847361, + "tokens_seen": 3096510464 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033190761334473914, + "loss": 2.7512, + "theoretical_loss": 3.3127067978554092, + "tokens_seen": 3096576000 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003318006843455945, + "loss": 2.7078, + "theoretical_loss": 3.3127014420085494, + "tokens_seen": 3096641536 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033169375534645, + "loss": 2.7268, + "theoretical_loss": 3.3126960863067736, + "tokens_seen": 3096707072 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003315868263473054, + "loss": 2.7363, + "theoretical_loss": 3.3126907307500755, + "tokens_seen": 3096772608 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003314798973481608, + "loss": 2.5158, + "theoretical_loss": 3.3126853753384484, + "tokens_seen": 3096838144 + }, + { + "epoch": 0.34, + "objective/train/docs_used": 1696582, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.6376142501831055, + "objective/train/theoretical_loss": 3.3126800200718844, + "objective/train/tokens_used": 126954976, + "theoretical_loss": 3.3126800200718844, + "tokens_seen": 3096903680 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033137296834901623, + "loss": 2.6679, + "theoretical_loss": 3.3126800200718844, + "tokens_seen": 3096903680 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003312660393498717, + "loss": 2.6862, + "theoretical_loss": 3.312674664950377, + "tokens_seen": 3096969216 + }, + { + "epoch": 0.34, + "learning_rate": 0.0003311591103507271, + "loss": 2.81, + "theoretical_loss": 3.312669309973919, + "tokens_seen": 3097034752 + }, + { + "epoch": 0.34, + "learning_rate": 0.00033105218135158254, + "loss": 2.7885, + "theoretical_loss": 3.3126639551425043, + "tokens_seen": 3097100288 + }, + { + "epoch": 0.34, + "learning_rate": 0.000330945252352438, + "loss": 2.6725, + "theoretical_loss": 3.3126586004561247, + "tokens_seen": 3097165824 + }, + { + "epoch": 0.35, + "learning_rate": 0.00033083832335329343, + "loss": 2.6585, + "theoretical_loss": 3.3126532459147735, + "tokens_seen": 3097231360 + }, + { + "epoch": 0.35, + "learning_rate": 0.00033073139435414885, + "loss": 2.6805, + "theoretical_loss": 3.312647891518444, + "tokens_seen": 3097296896 + }, + { + "epoch": 0.35, + "learning_rate": 0.00033062446535500427, + "loss": 2.8118, + "theoretical_loss": 3.3126425372671293, + "tokens_seen": 3097362432 + }, + { + "epoch": 0.35, + "learning_rate": 0.00033051753635585974, + "loss": 2.5444, + "theoretical_loss": 3.312637183160822, + "tokens_seen": 3097427968 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003304106073567151, + "loss": 2.627, + "theoretical_loss": 3.3126318291995154, + "tokens_seen": 3097493504 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003303036783575706, + "loss": 2.8452, + "theoretical_loss": 3.3126264753832024, + "tokens_seen": 3097559040 + }, + { + "epoch": 0.35, + "learning_rate": 0.00033019674935842605, + "loss": 2.5281, + "theoretical_loss": 3.312621121711876, + "tokens_seen": 3097624576 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003300898203592814, + "loss": 2.61, + "theoretical_loss": 3.312615768185529, + "tokens_seen": 3097690112 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003299828913601369, + "loss": 2.6085, + "theoretical_loss": 3.312610414804155, + "tokens_seen": 3097755648 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003298759623609923, + "loss": 2.5078, + "theoretical_loss": 3.3126050615677465, + "tokens_seen": 3097821184 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032976903336184773, + "loss": 2.6693, + "theoretical_loss": 3.312599708476297, + "tokens_seen": 3097886720 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032966210436270315, + "loss": 2.387, + "theoretical_loss": 3.3125943555297983, + "tokens_seen": 3097952256 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003295551753635586, + "loss": 2.6741, + "theoretical_loss": 3.3125890027282447, + "tokens_seen": 3098017792 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032944824636441404, + "loss": 2.6333, + "theoretical_loss": 3.3125836500716286, + "tokens_seen": 3098083328 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032934131736526946, + "loss": 2.8194, + "theoretical_loss": 3.3125782975599436, + "tokens_seen": 3098148864 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032923438836612493, + "loss": 2.7274, + "theoretical_loss": 3.312572945193182, + "tokens_seen": 3098214400 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032912745936698035, + "loss": 2.573, + "theoretical_loss": 3.312567592971337, + "tokens_seen": 3098279936 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032902053036783577, + "loss": 2.795, + "theoretical_loss": 3.312562240894402, + "tokens_seen": 3098345472 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003289136013686912, + "loss": 2.467, + "theoretical_loss": 3.3125568889623693, + "tokens_seen": 3098411008 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032880667236954666, + "loss": 2.4533, + "theoretical_loss": 3.3125515371752323, + "tokens_seen": 3098476544 + }, + { + "epoch": 0.35, + "objective/train/docs_used": 1697795, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.5208029747009277, + "objective/train/theoretical_loss": 3.3125461855329847, + "objective/train/tokens_used": 128593376, + "theoretical_loss": 3.3125461855329847, + "tokens_seen": 3098542080 + }, + { + "epoch": 0.35, + "learning_rate": 0.000328699743370402, + "loss": 2.6665, + "theoretical_loss": 3.3125461855329847, + "tokens_seen": 3098542080 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003285928143712575, + "loss": 2.536, + "theoretical_loss": 3.3125408340356186, + "tokens_seen": 3098607616 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003284858853721129, + "loss": 2.8448, + "theoretical_loss": 3.312535482683127, + "tokens_seen": 3098673152 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032837895637296833, + "loss": 2.6499, + "theoretical_loss": 3.312530131475503, + "tokens_seen": 3098738688 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032827202737382375, + "loss": 2.5347, + "theoretical_loss": 3.3125247804127405, + "tokens_seen": 3098804224 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003281650983746792, + "loss": 2.5525, + "theoretical_loss": 3.3125194294948312, + "tokens_seen": 3098869760 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003280581693755347, + "loss": 2.3905, + "theoretical_loss": 3.312514078721769, + "tokens_seen": 3098935296 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032795124037639006, + "loss": 2.6376, + "theoretical_loss": 3.3125087280935466, + "tokens_seen": 3099000832 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032784431137724553, + "loss": 2.5707, + "theoretical_loss": 3.312503377610157, + "tokens_seen": 3099066368 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032773738237810095, + "loss": 2.5814, + "theoretical_loss": 3.3124980272715936, + "tokens_seen": 3099131904 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032763045337895637, + "loss": 2.6109, + "theoretical_loss": 3.3124926770778487, + "tokens_seen": 3099197440 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003275235243798118, + "loss": 2.6777, + "theoretical_loss": 3.3124873270289155, + "tokens_seen": 3099262976 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032741659538066726, + "loss": 2.7706, + "theoretical_loss": 3.3124819771247878, + "tokens_seen": 3099328512 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032730966638152263, + "loss": 2.5438, + "theoretical_loss": 3.3124766273654576, + "tokens_seen": 3099394048 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003272027373823781, + "loss": 2.4304, + "theoretical_loss": 3.3124712777509187, + "tokens_seen": 3099459584 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003270958083832336, + "loss": 2.4997, + "theoretical_loss": 3.3124659282811635, + "tokens_seen": 3099525120 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032698887938408894, + "loss": 2.3794, + "theoretical_loss": 3.312460578956186, + "tokens_seen": 3099590656 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003268819503849444, + "loss": 2.5964, + "theoretical_loss": 3.3124552297759777, + "tokens_seen": 3099656192 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032677502138579983, + "loss": 2.8176, + "theoretical_loss": 3.3124498807405325, + "tokens_seen": 3099721728 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003266680923866553, + "loss": 2.844, + "theoretical_loss": 3.312444531849844, + "tokens_seen": 3099787264 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032656116338751067, + "loss": 2.7205, + "theoretical_loss": 3.312439183103904, + "tokens_seen": 3099852800 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032645423438836614, + "loss": 2.5019, + "theoretical_loss": 3.3124338345027065, + "tokens_seen": 3099918336 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032634730538922156, + "loss": 2.817, + "theoretical_loss": 3.312428486046244, + "tokens_seen": 3099983872 + }, + { + "epoch": 0.35, + "learning_rate": 0.000326240376390077, + "loss": 2.7284, + "theoretical_loss": 3.3124231377345095, + "tokens_seen": 3100049408 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032613344739093245, + "loss": 2.4798, + "theoretical_loss": 3.312417789567496, + "tokens_seen": 3100114944 + }, + { + "epoch": 0.35, + "objective/train/docs_used": 1698452, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.4108951091766357, + "objective/train/theoretical_loss": 3.3124124415451974, + "objective/train/tokens_used": 130231776, + "theoretical_loss": 3.3124124415451974, + "tokens_seen": 3100180480 + }, + { + "epoch": 0.35, + "learning_rate": 0.00032602651839178787, + "loss": 2.7243, + "theoretical_loss": 3.3124124415451974, + "tokens_seen": 3100180480 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003259195893926433, + "loss": 2.6773, + "theoretical_loss": 3.3124070936676056, + "tokens_seen": 3100246016 + }, + { + "epoch": 0.35, + "learning_rate": 0.0003258126603934987, + "loss": 2.6556, + "theoretical_loss": 3.3124017459347144, + "tokens_seen": 3100311552 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003257057313943542, + "loss": 2.5404, + "theoretical_loss": 3.312396398346516, + "tokens_seen": 3100377088 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003255988023952096, + "loss": 2.6344, + "theoretical_loss": 3.3123910509030043, + "tokens_seen": 3100442624 + }, + { + "epoch": 0.36, + "learning_rate": 0.000325491873396065, + "loss": 2.2866, + "theoretical_loss": 3.3123857036041717, + "tokens_seen": 3100508160 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032538494439692043, + "loss": 2.657, + "theoretical_loss": 3.3123803564500114, + "tokens_seen": 3100573696 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003252780153977759, + "loss": 2.7781, + "theoretical_loss": 3.3123750094405167, + "tokens_seen": 3100639232 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003251710863986313, + "loss": 2.4673, + "theoretical_loss": 3.3123696625756804, + "tokens_seen": 3100704768 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032506415739948674, + "loss": 2.599, + "theoretical_loss": 3.3123643158554956, + "tokens_seen": 3100770304 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003249572284003422, + "loss": 2.5754, + "theoretical_loss": 3.312358969279955, + "tokens_seen": 3100835840 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003248502994011976, + "loss": 2.5899, + "theoretical_loss": 3.312353622849052, + "tokens_seen": 3100901376 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032474337040205305, + "loss": 2.4501, + "theoretical_loss": 3.31234827656278, + "tokens_seen": 3100966912 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003246364414029085, + "loss": 2.611, + "theoretical_loss": 3.3123429304211314, + "tokens_seen": 3101032448 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003245295124037639, + "loss": 2.3909, + "theoretical_loss": 3.312337584424099, + "tokens_seen": 3101097984 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003244225834046193, + "loss": 2.647, + "theoretical_loss": 3.3123322385716767, + "tokens_seen": 3101163520 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003243156544054748, + "loss": 2.4831, + "theoretical_loss": 3.312326892863857, + "tokens_seen": 3101229056 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032420872540633026, + "loss": 2.5188, + "theoretical_loss": 3.312321547300633, + "tokens_seen": 3101294592 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003241017964071856, + "loss": 2.6367, + "theoretical_loss": 3.3123162018819974, + "tokens_seen": 3101360128 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003239948674080411, + "loss": 2.8672, + "theoretical_loss": 3.3123108566079438, + "tokens_seen": 3101425664 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003238879384088965, + "loss": 2.643, + "theoretical_loss": 3.312305511478465, + "tokens_seen": 3101491200 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032378100940975193, + "loss": 2.5004, + "theoretical_loss": 3.3123001664935545, + "tokens_seen": 3101556736 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032367408041060735, + "loss": 2.7209, + "theoretical_loss": 3.312294821653204, + "tokens_seen": 3101622272 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003235671514114628, + "loss": 2.449, + "theoretical_loss": 3.3122894769574085, + "tokens_seen": 3101687808 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003234602224123182, + "loss": 2.6206, + "theoretical_loss": 3.312284132406159, + "tokens_seen": 3101753344 + }, + { + "epoch": 0.36, + "objective/train/docs_used": 1699609, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.963756799697876, + "objective/train/theoretical_loss": 3.3122787879994497, + "objective/train/tokens_used": 131870176, + "theoretical_loss": 3.3122787879994497, + "tokens_seen": 3101818880 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032335329341317366, + "loss": 2.7136, + "theoretical_loss": 3.3122787879994497, + "tokens_seen": 3101818880 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003232463644140291, + "loss": 2.5142, + "theoretical_loss": 3.312273443737274, + "tokens_seen": 3101884416 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003231394354148845, + "loss": 2.4583, + "theoretical_loss": 3.3122680996196237, + "tokens_seen": 3101949952 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032303250641573997, + "loss": 2.685, + "theoretical_loss": 3.3122627556464925, + "tokens_seen": 3102015488 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003229255774165954, + "loss": 2.6635, + "theoretical_loss": 3.312257411817874, + "tokens_seen": 3102081024 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032281864841745086, + "loss": 2.4668, + "theoretical_loss": 3.31225206813376, + "tokens_seen": 3102146560 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003227117194183062, + "loss": 2.7014, + "theoretical_loss": 3.3122467245941447, + "tokens_seen": 3102212096 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003226047904191617, + "loss": 2.6762, + "theoretical_loss": 3.3122413811990206, + "tokens_seen": 3102277632 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003224978614200171, + "loss": 2.5899, + "theoretical_loss": 3.312236037948381, + "tokens_seen": 3102343168 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032239093242087254, + "loss": 2.6213, + "theoretical_loss": 3.312230694842218, + "tokens_seen": 3102408704 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032228400342172795, + "loss": 2.5883, + "theoretical_loss": 3.312225351880526, + "tokens_seen": 3102474240 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032217707442258343, + "loss": 2.4601, + "theoretical_loss": 3.312220009063297, + "tokens_seen": 3102539776 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032207014542343885, + "loss": 2.7531, + "theoretical_loss": 3.312214666390525, + "tokens_seen": 3102605312 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032196321642429426, + "loss": 2.7529, + "theoretical_loss": 3.312209323862202, + "tokens_seen": 3102670848 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032185628742514974, + "loss": 2.6192, + "theoretical_loss": 3.312203981478322, + "tokens_seen": 3102736384 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003217493584260051, + "loss": 2.7074, + "theoretical_loss": 3.3121986392388774, + "tokens_seen": 3102801920 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003216424294268606, + "loss": 2.824, + "theoretical_loss": 3.3121932971438612, + "tokens_seen": 3102867456 + }, + { + "epoch": 0.36, + "learning_rate": 0.000321535500427716, + "loss": 2.7669, + "theoretical_loss": 3.312187955193267, + "tokens_seen": 3102932992 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032142857142857147, + "loss": 2.7203, + "theoretical_loss": 3.3121826133870873, + "tokens_seen": 3102998528 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032132164242942683, + "loss": 2.895, + "theoretical_loss": 3.3121772717253157, + "tokens_seen": 3103064064 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003212147134302823, + "loss": 2.4758, + "theoretical_loss": 3.3121719302079446, + "tokens_seen": 3103129600 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003211077844311378, + "loss": 2.7221, + "theoretical_loss": 3.312166588834968, + "tokens_seen": 3103195136 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032100085543199314, + "loss": 2.6732, + "theoretical_loss": 3.3121612476063778, + "tokens_seen": 3103260672 + }, + { + "epoch": 0.36, + "learning_rate": 0.0003208939264328486, + "loss": 2.8059, + "theoretical_loss": 3.3121559065221673, + "tokens_seen": 3103326208 + }, + { + "epoch": 0.36, + "learning_rate": 0.00032078699743370403, + "loss": 2.542, + "theoretical_loss": 3.3121505655823302, + "tokens_seen": 3103391744 + }, + { + "epoch": 0.36, + "objective/train/docs_used": 1699977, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.559115171432495, + "objective/train/theoretical_loss": 3.312145224786859, + "objective/train/tokens_used": 133508576, + "theoretical_loss": 3.312145224786859, + "tokens_seen": 3103457280 + }, + { + "epoch": 0.37, + "learning_rate": 0.00032068006843455945, + "loss": 2.5754, + "theoretical_loss": 3.312145224786859, + "tokens_seen": 3103457280 + }, + { + "epoch": 0.37, + "learning_rate": 0.00032057313943541487, + "loss": 2.724, + "theoretical_loss": 3.312139884135747, + "tokens_seen": 3103522816 + }, + { + "epoch": 0.37, + "learning_rate": 0.00032046621043627034, + "loss": 2.6041, + "theoretical_loss": 3.312134543628987, + "tokens_seen": 3103588352 + }, + { + "epoch": 0.37, + "learning_rate": 0.00032035928143712576, + "loss": 2.5418, + "theoretical_loss": 3.3121292032665726, + "tokens_seen": 3103653888 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003202523524379812, + "loss": 2.4421, + "theoretical_loss": 3.3121238630484964, + "tokens_seen": 3103719424 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003201454234388366, + "loss": 2.6773, + "theoretical_loss": 3.3121185229747514, + "tokens_seen": 3103784960 + }, + { + "epoch": 0.37, + "learning_rate": 0.00032003849443969207, + "loss": 2.7928, + "theoretical_loss": 3.3121131830453305, + "tokens_seen": 3103850496 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003199315654405475, + "loss": 2.6714, + "theoretical_loss": 3.312107843260227, + "tokens_seen": 3103916032 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003198246364414029, + "loss": 2.5834, + "theoretical_loss": 3.3121025036194345, + "tokens_seen": 3103981568 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003197177074422584, + "loss": 2.7835, + "theoretical_loss": 3.312097164122945, + "tokens_seen": 3104047104 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031961077844311375, + "loss": 2.5629, + "theoretical_loss": 3.3120918247707527, + "tokens_seen": 3104112640 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003195038494439692, + "loss": 2.5316, + "theoretical_loss": 3.3120864855628493, + "tokens_seen": 3104178176 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031939692044482464, + "loss": 2.5927, + "theoretical_loss": 3.3120811464992292, + "tokens_seen": 3104243712 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031928999144568006, + "loss": 2.6749, + "theoretical_loss": 3.3120758075798844, + "tokens_seen": 3104309248 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003191830624465355, + "loss": 2.5939, + "theoretical_loss": 3.3120704688048086, + "tokens_seen": 3104374784 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031907613344739095, + "loss": 2.5078, + "theoretical_loss": 3.3120651301739947, + "tokens_seen": 3104440320 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003189692044482464, + "loss": 2.5793, + "theoretical_loss": 3.3120597916874357, + "tokens_seen": 3104505856 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003188622754491018, + "loss": 2.7482, + "theoretical_loss": 3.312054453345125, + "tokens_seen": 3104571392 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031875534644995726, + "loss": 2.7607, + "theoretical_loss": 3.3120491151470546, + "tokens_seen": 3104636928 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003186484174508127, + "loss": 2.7111, + "theoretical_loss": 3.3120437770932183, + "tokens_seen": 3104702464 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003185414884516681, + "loss": 2.5432, + "theoretical_loss": 3.3120384391836097, + "tokens_seen": 3104768000 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003184345594525235, + "loss": 2.664, + "theoretical_loss": 3.312033101418221, + "tokens_seen": 3104833536 + }, + { + "epoch": 0.37, + "learning_rate": 0.000318327630453379, + "loss": 2.6097, + "theoretical_loss": 3.3120277637970457, + "tokens_seen": 3104899072 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031822070145423435, + "loss": 2.7661, + "theoretical_loss": 3.3120224263200764, + "tokens_seen": 3104964608 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003181137724550898, + "loss": 2.7267, + "theoretical_loss": 3.312017088987307, + "tokens_seen": 3105030144 + }, + { + "epoch": 0.37, + "objective/train/docs_used": 1701277, + "objective/train/instantaneous_batch_size": 16, + "objective/train/instantaneous_microbatch_size": 16384, + "objective/train/original_loss": 2.1794204711914062, + "objective/train/theoretical_loss": 3.3120117517987295, + "objective/train/tokens_used": 135146976, + "theoretical_loss": 3.3120117517987295, + "tokens_seen": 3105095680 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003180068434559453, + "loss": 2.545, + "theoretical_loss": 3.3120117517987295, + "tokens_seen": 3105095680 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031789991445680066, + "loss": 2.6542, + "theoretical_loss": 3.3120064147543378, + "tokens_seen": 3105161216 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031779298545765613, + "loss": 2.6784, + "theoretical_loss": 3.312001077854125, + "tokens_seen": 3105226752 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031768605645851155, + "loss": 2.6771, + "theoretical_loss": 3.311995741098083, + "tokens_seen": 3105292288 + }, + { + "epoch": 0.37, + "learning_rate": 0.000317579127459367, + "loss": 2.5683, + "theoretical_loss": 3.311990404486206, + "tokens_seen": 3105357824 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003174721984602224, + "loss": 2.7383, + "theoretical_loss": 3.3119850680184872, + "tokens_seen": 3105423360 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031736526946107786, + "loss": 2.4683, + "theoretical_loss": 3.311979731694919, + "tokens_seen": 3105488896 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003172583404619333, + "loss": 2.5764, + "theoretical_loss": 3.3119743955154943, + "tokens_seen": 3105554432 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003171514114627887, + "loss": 2.6422, + "theoretical_loss": 3.311969059480207, + "tokens_seen": 3105619968 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031704448246364417, + "loss": 2.5915, + "theoretical_loss": 3.3119637235890496, + "tokens_seen": 3105685504 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003169375534644996, + "loss": 2.6249, + "theoretical_loss": 3.311958387842015, + "tokens_seen": 3105751040 + }, + { + "epoch": 0.37, + "learning_rate": 0.000316830624465355, + "loss": 2.6656, + "theoretical_loss": 3.311953052239097, + "tokens_seen": 3105816576 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031672369546621043, + "loss": 2.7245, + "theoretical_loss": 3.311947716780288, + "tokens_seen": 3105882112 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003166167664670659, + "loss": 2.5704, + "theoretical_loss": 3.311942381465581, + "tokens_seen": 3105947648 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031650983746792127, + "loss": 2.6325, + "theoretical_loss": 3.31193704629497, + "tokens_seen": 3106013184 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031640290846877674, + "loss": 2.6998, + "theoretical_loss": 3.3119317112684468, + "tokens_seen": 3106078720 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031629597946963216, + "loss": 2.8155, + "theoretical_loss": 3.311926376386005, + "tokens_seen": 3106144256 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031618905047048763, + "loss": 2.8693, + "theoretical_loss": 3.311921041647638, + "tokens_seen": 3106209792 + }, + { + "epoch": 0.37, + "learning_rate": 0.000316082121471343, + "loss": 2.6014, + "theoretical_loss": 3.311915707053339, + "tokens_seen": 3106275328 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031597519247219847, + "loss": 2.5689, + "theoretical_loss": 3.3119103726031005, + "tokens_seen": 3106340864 + }, + { + "epoch": 0.37, + "learning_rate": 0.00031586826347305394, + "loss": 2.6828, + "theoretical_loss": 3.3119050382969153, + "tokens_seen": 3106406400 + }, + { + "epoch": 0.37, + "learning_rate": 0.0003157613344739093, + "loss": 2.6957, + "theoretical_loss": 3.3118997041347775, + "tokens_seen": 3106471936 + } + ], + "max_steps": 4724, + "num_train_epochs": 9223372036854775807, + "total_flos": 5.9231768150016e+16, + "trial_name": null, + "trial_params": null +}