{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1606689246401355, "global_step": 759, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.0416666666666666e-05, "loss": 3.1158, "theoretical_loss": 3.321573280713233, "tokens_seen": 2990473216 }, { "epoch": 0.0, "learning_rate": 2.0833333333333333e-05, "loss": 2.8674, "theoretical_loss": 3.321567680436603, "tokens_seen": 2990538752 }, { "epoch": 0.0, "learning_rate": 3.125e-05, "loss": 3.1083, "theoretical_loss": 3.321562080317061, "tokens_seen": 2990604288 }, { "epoch": 0.0, "learning_rate": 4.1666666666666665e-05, "loss": 2.8175, "theoretical_loss": 3.3215564803546, "tokens_seen": 2990669824 }, { "epoch": 0.0, "learning_rate": 5.208333333333334e-05, "loss": 2.8746, "theoretical_loss": 3.321550880549211, "tokens_seen": 2990735360 }, { "epoch": 0.0, "learning_rate": 6.25e-05, "loss": 2.6234, "theoretical_loss": 3.321545280900887, "tokens_seen": 2990800896 }, { "epoch": 0.0, "learning_rate": 7.291666666666667e-05, "loss": 2.6986, "theoretical_loss": 3.32153968140962, "tokens_seen": 2990866432 }, { "epoch": 0.0, "learning_rate": 8.333333333333333e-05, "loss": 2.9684, "theoretical_loss": 3.3215340820754022, "tokens_seen": 2990931968 }, { "epoch": 0.0, "learning_rate": 9.375e-05, "loss": 3.0289, "theoretical_loss": 3.321528482898225, "tokens_seen": 2990997504 }, { "epoch": 0.0, "learning_rate": 0.00010416666666666667, "loss": 2.5923, "theoretical_loss": 3.3215228838780817, "tokens_seen": 2991063040 }, { "epoch": 0.0, "learning_rate": 0.00011458333333333333, "loss": 2.7436, "theoretical_loss": 3.3215172850149637, "tokens_seen": 2991128576 }, { "epoch": 0.0, "learning_rate": 0.000125, "loss": 2.7711, "theoretical_loss": 3.3215116863088636, "tokens_seen": 2991194112 }, { "epoch": 0.0, "learning_rate": 0.00013541666666666666, "loss": 2.4174, "theoretical_loss": 3.3215060877597735, "tokens_seen": 2991259648 }, { "epoch": 0.0, "learning_rate": 0.00014583333333333335, "loss": 2.5628, "theoretical_loss": 3.3215004893676854, "tokens_seen": 2991325184 }, { "epoch": 0.0, "learning_rate": 0.00015625, "loss": 2.5322, "theoretical_loss": 3.3214948911325908, "tokens_seen": 2991390720 }, { "epoch": 0.0, "learning_rate": 0.00016666666666666666, "loss": 2.5703, "theoretical_loss": 3.321489293054483, "tokens_seen": 2991456256 }, { "epoch": 0.0, "learning_rate": 0.00017708333333333335, "loss": 2.4464, "theoretical_loss": 3.3214836951333537, "tokens_seen": 2991521792 }, { "epoch": 0.0, "learning_rate": 0.0001875, "loss": 2.5979, "theoretical_loss": 3.321478097369195, "tokens_seen": 2991587328 }, { "epoch": 0.0, "learning_rate": 0.00019791666666666666, "loss": 2.6103, "theoretical_loss": 3.321472499761999, "tokens_seen": 2991652864 }, { "epoch": 0.0, "learning_rate": 0.00020833333333333335, "loss": 2.5729, "theoretical_loss": 3.321466902311758, "tokens_seen": 2991718400 }, { "epoch": 0.0, "learning_rate": 0.00021875, "loss": 2.3812, "theoretical_loss": 3.321461305018464, "tokens_seen": 2991783936 }, { "epoch": 0.0, "learning_rate": 0.00022916666666666666, "loss": 2.461, "theoretical_loss": 3.3214557078821096, "tokens_seen": 2991849472 }, { "epoch": 0.0, "learning_rate": 0.00023958333333333335, "loss": 2.3162, "theoretical_loss": 3.3214501109026866, "tokens_seen": 2991915008 }, { "epoch": 0.01, "learning_rate": 0.00025, "loss": 2.4302, "theoretical_loss": 3.321444514080187, "tokens_seen": 2991980544 }, { "epoch": 0.01, "objective/train/docs_used": 1640731, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.594472885131836, "objective/train/theoretical_loss": 3.321438917414603, "objective/train/tokens_used": 22097376, "theoretical_loss": 3.321438917414603, "tokens_seen": 2992046080 }, { "epoch": 0.01, "learning_rate": 0.0002604166666666667, "loss": 2.6341, "theoretical_loss": 3.321438917414603, "tokens_seen": 2992046080 }, { "epoch": 0.01, "learning_rate": 0.0002708333333333333, "loss": 2.6529, "theoretical_loss": 3.321433320905927, "tokens_seen": 2992111616 }, { "epoch": 0.01, "learning_rate": 0.00028125000000000003, "loss": 2.6057, "theoretical_loss": 3.3214277245541513, "tokens_seen": 2992177152 }, { "epoch": 0.01, "learning_rate": 0.0002916666666666667, "loss": 2.6216, "theoretical_loss": 3.3214221283592678, "tokens_seen": 2992242688 }, { "epoch": 0.01, "learning_rate": 0.0003020833333333333, "loss": 2.4586, "theoretical_loss": 3.321416532321269, "tokens_seen": 2992308224 }, { "epoch": 0.01, "learning_rate": 0.0003125, "loss": 2.4143, "theoretical_loss": 3.321410936440146, "tokens_seen": 2992373760 }, { "epoch": 0.01, "learning_rate": 0.0003229166666666667, "loss": 2.6421, "theoretical_loss": 3.3214053407158923, "tokens_seen": 2992439296 }, { "epoch": 0.01, "learning_rate": 0.0003333333333333333, "loss": 2.4524, "theoretical_loss": 3.3213997451485, "tokens_seen": 2992504832 }, { "epoch": 0.01, "learning_rate": 0.00034375, "loss": 2.4758, "theoretical_loss": 3.32139414973796, "tokens_seen": 2992570368 }, { "epoch": 0.01, "learning_rate": 0.0003541666666666667, "loss": 2.4524, "theoretical_loss": 3.3213885544842654, "tokens_seen": 2992635904 }, { "epoch": 0.01, "learning_rate": 0.0003645833333333333, "loss": 2.7206, "theoretical_loss": 3.3213829593874085, "tokens_seen": 2992701440 }, { "epoch": 0.01, "learning_rate": 0.000375, "loss": 2.5027, "theoretical_loss": 3.321377364447381, "tokens_seen": 2992766976 }, { "epoch": 0.01, "learning_rate": 0.0003854166666666667, "loss": 2.6288, "theoretical_loss": 3.321371769664175, "tokens_seen": 2992832512 }, { "epoch": 0.01, "learning_rate": 0.0003958333333333333, "loss": 2.6262, "theoretical_loss": 3.3213661750377836, "tokens_seen": 2992898048 }, { "epoch": 0.01, "learning_rate": 0.00040625000000000004, "loss": 2.4324, "theoretical_loss": 3.321360580568198, "tokens_seen": 2992963584 }, { "epoch": 0.01, "learning_rate": 0.0004166666666666667, "loss": 2.3967, "theoretical_loss": 3.3213549862554106, "tokens_seen": 2993029120 }, { "epoch": 0.01, "learning_rate": 0.0004270833333333333, "loss": 2.4334, "theoretical_loss": 3.321349392099414, "tokens_seen": 2993094656 }, { "epoch": 0.01, "learning_rate": 0.0004375, "loss": 2.5264, "theoretical_loss": 3.3213437981001994, "tokens_seen": 2993160192 }, { "epoch": 0.01, "learning_rate": 0.0004479166666666667, "loss": 2.3953, "theoretical_loss": 3.32133820425776, "tokens_seen": 2993225728 }, { "epoch": 0.01, "learning_rate": 0.0004583333333333333, "loss": 2.5172, "theoretical_loss": 3.3213326105720875, "tokens_seen": 2993291264 }, { "epoch": 0.01, "learning_rate": 0.00046875, "loss": 2.6647, "theoretical_loss": 3.321327017043174, "tokens_seen": 2993356800 }, { "epoch": 0.01, "learning_rate": 0.0004791666666666667, "loss": 2.4897, "theoretical_loss": 3.3213214236710122, "tokens_seen": 2993422336 }, { "epoch": 0.01, "learning_rate": 0.0004895833333333333, "loss": 2.5138, "theoretical_loss": 3.321315830455594, "tokens_seen": 2993487872 }, { "epoch": 0.01, "learning_rate": 0.0005, "loss": 2.372, "theoretical_loss": 3.321310237396911, "tokens_seen": 2993553408 }, { "epoch": 0.01, "learning_rate": 0.0004998930710008554, "loss": 2.3235, "theoretical_loss": 3.321304644494956, "tokens_seen": 2993618944 }, { "epoch": 0.01, "objective/train/docs_used": 1641796, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.1646158695220947, "objective/train/theoretical_loss": 3.3212990517497207, "objective/train/tokens_used": 23735776, "theoretical_loss": 3.3212990517497207, "tokens_seen": 2993684480 }, { "epoch": 0.01, "learning_rate": 0.0004997861420017108, "loss": 2.7205, "theoretical_loss": 3.3212990517497207, "tokens_seen": 2993684480 }, { "epoch": 0.01, "learning_rate": 0.0004996792130025663, "loss": 2.6519, "theoretical_loss": 3.3212934591611982, "tokens_seen": 2993750016 }, { "epoch": 0.01, "learning_rate": 0.0004995722840034218, "loss": 2.5422, "theoretical_loss": 3.3212878667293797, "tokens_seen": 2993815552 }, { "epoch": 0.01, "learning_rate": 0.0004994653550042771, "loss": 2.6428, "theoretical_loss": 3.321282274454258, "tokens_seen": 2993881088 }, { "epoch": 0.01, "learning_rate": 0.0004993584260051326, "loss": 2.5495, "theoretical_loss": 3.321276682335825, "tokens_seen": 2993946624 }, { "epoch": 0.01, "learning_rate": 0.000499251497005988, "loss": 2.5962, "theoretical_loss": 3.321271090374073, "tokens_seen": 2994012160 }, { "epoch": 0.01, "learning_rate": 0.0004991445680068435, "loss": 2.5919, "theoretical_loss": 3.3212654985689936, "tokens_seen": 2994077696 }, { "epoch": 0.01, "learning_rate": 0.0004990376390076989, "loss": 2.5942, "theoretical_loss": 3.32125990692058, "tokens_seen": 2994143232 }, { "epoch": 0.01, "learning_rate": 0.0004989307100085543, "loss": 2.2569, "theoretical_loss": 3.3212543154288237, "tokens_seen": 2994208768 }, { "epoch": 0.01, "learning_rate": 0.0004988237810094098, "loss": 2.446, "theoretical_loss": 3.321248724093717, "tokens_seen": 2994274304 }, { "epoch": 0.01, "learning_rate": 0.0004987168520102651, "loss": 2.5982, "theoretical_loss": 3.3212431329152525, "tokens_seen": 2994339840 }, { "epoch": 0.01, "learning_rate": 0.0004986099230111207, "loss": 2.3817, "theoretical_loss": 3.3212375418934217, "tokens_seen": 2994405376 }, { "epoch": 0.01, "learning_rate": 0.0004985029940119761, "loss": 2.5162, "theoretical_loss": 3.321231951028217, "tokens_seen": 2994470912 }, { "epoch": 0.01, "learning_rate": 0.0004983960650128315, "loss": 2.7158, "theoretical_loss": 3.321226360319631, "tokens_seen": 2994536448 }, { "epoch": 0.01, "learning_rate": 0.0004982891360136869, "loss": 2.555, "theoretical_loss": 3.3212207697676552, "tokens_seen": 2994601984 }, { "epoch": 0.01, "learning_rate": 0.0004981822070145423, "loss": 2.2944, "theoretical_loss": 3.3212151793722824, "tokens_seen": 2994667520 }, { "epoch": 0.01, "learning_rate": 0.0004980752780153978, "loss": 2.4699, "theoretical_loss": 3.3212095891335043, "tokens_seen": 2994733056 }, { "epoch": 0.01, "learning_rate": 0.0004979683490162532, "loss": 2.5024, "theoretical_loss": 3.321203999051314, "tokens_seen": 2994798592 }, { "epoch": 0.01, "learning_rate": 0.0004978614200171087, "loss": 2.5346, "theoretical_loss": 3.321198409125702, "tokens_seen": 2994864128 }, { "epoch": 0.01, "learning_rate": 0.0004977544910179641, "loss": 2.7699, "theoretical_loss": 3.321192819356662, "tokens_seen": 2994929664 }, { "epoch": 0.01, "learning_rate": 0.0004976475620188195, "loss": 2.3356, "theoretical_loss": 3.321187229744186, "tokens_seen": 2994995200 }, { "epoch": 0.02, "learning_rate": 0.000497540633019675, "loss": 2.4074, "theoretical_loss": 3.3211816402882652, "tokens_seen": 2995060736 }, { "epoch": 0.02, "learning_rate": 0.0004974337040205304, "loss": 2.6453, "theoretical_loss": 3.321176050988893, "tokens_seen": 2995126272 }, { "epoch": 0.02, "learning_rate": 0.0004973267750213858, "loss": 2.5405, "theoretical_loss": 3.3211704618460614, "tokens_seen": 2995191808 }, { "epoch": 0.02, "learning_rate": 0.0004972198460222412, "loss": 2.645, "theoretical_loss": 3.3211648728597614, "tokens_seen": 2995257344 }, { "epoch": 0.02, "objective/train/docs_used": 1642448, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.835965394973755, "objective/train/theoretical_loss": 3.3211592840299864, "objective/train/tokens_used": 25374176, "theoretical_loss": 3.3211592840299864, "tokens_seen": 2995322880 }, { "epoch": 0.02, "learning_rate": 0.0004971129170230966, "loss": 2.5601, "theoretical_loss": 3.3211592840299864, "tokens_seen": 2995322880 }, { "epoch": 0.02, "learning_rate": 0.0004970059880239521, "loss": 2.6971, "theoretical_loss": 3.3211536953567284, "tokens_seen": 2995388416 }, { "epoch": 0.02, "learning_rate": 0.0004968990590248076, "loss": 2.4257, "theoretical_loss": 3.321148106839979, "tokens_seen": 2995453952 }, { "epoch": 0.02, "learning_rate": 0.000496792130025663, "loss": 2.5419, "theoretical_loss": 3.321142518479731, "tokens_seen": 2995519488 }, { "epoch": 0.02, "learning_rate": 0.0004966852010265184, "loss": 2.7436, "theoretical_loss": 3.321136930275977, "tokens_seen": 2995585024 }, { "epoch": 0.02, "learning_rate": 0.0004965782720273738, "loss": 2.5704, "theoretical_loss": 3.321131342228708, "tokens_seen": 2995650560 }, { "epoch": 0.02, "learning_rate": 0.0004964713430282293, "loss": 2.8543, "theoretical_loss": 3.321125754337917, "tokens_seen": 2995716096 }, { "epoch": 0.02, "learning_rate": 0.0004963644140290847, "loss": 2.5733, "theoretical_loss": 3.321120166603596, "tokens_seen": 2995781632 }, { "epoch": 0.02, "learning_rate": 0.0004962574850299401, "loss": 2.5429, "theoretical_loss": 3.3211145790257373, "tokens_seen": 2995847168 }, { "epoch": 0.02, "learning_rate": 0.0004961505560307955, "loss": 2.7132, "theoretical_loss": 3.3211089916043326, "tokens_seen": 2995912704 }, { "epoch": 0.02, "learning_rate": 0.000496043627031651, "loss": 2.4883, "theoretical_loss": 3.3211034043393743, "tokens_seen": 2995978240 }, { "epoch": 0.02, "learning_rate": 0.0004959366980325064, "loss": 2.5549, "theoretical_loss": 3.3210978172308554, "tokens_seen": 2996043776 }, { "epoch": 0.02, "learning_rate": 0.0004958297690333619, "loss": 2.7343, "theoretical_loss": 3.3210922302787673, "tokens_seen": 2996109312 }, { "epoch": 0.02, "learning_rate": 0.0004957228400342173, "loss": 2.5603, "theoretical_loss": 3.3210866434831026, "tokens_seen": 2996174848 }, { "epoch": 0.02, "learning_rate": 0.0004956159110350727, "loss": 2.7516, "theoretical_loss": 3.3210810568438527, "tokens_seen": 2996240384 }, { "epoch": 0.02, "learning_rate": 0.0004955089820359281, "loss": 2.4148, "theoretical_loss": 3.3210754703610106, "tokens_seen": 2996305920 }, { "epoch": 0.02, "learning_rate": 0.0004954020530367836, "loss": 2.5289, "theoretical_loss": 3.3210698840345687, "tokens_seen": 2996371456 }, { "epoch": 0.02, "learning_rate": 0.0004952951240376391, "loss": 2.4535, "theoretical_loss": 3.321064297864518, "tokens_seen": 2996436992 }, { "epoch": 0.02, "learning_rate": 0.0004951881950384944, "loss": 2.3, "theoretical_loss": 3.3210587118508523, "tokens_seen": 2996502528 }, { "epoch": 0.02, "learning_rate": 0.0004950812660393499, "loss": 2.6429, "theoretical_loss": 3.3210531259935627, "tokens_seen": 2996568064 }, { "epoch": 0.02, "learning_rate": 0.0004949743370402053, "loss": 2.4731, "theoretical_loss": 3.3210475402926414, "tokens_seen": 2996633600 }, { "epoch": 0.02, "learning_rate": 0.0004948674080410608, "loss": 2.636, "theoretical_loss": 3.321041954748081, "tokens_seen": 2996699136 }, { "epoch": 0.02, "learning_rate": 0.0004947604790419162, "loss": 2.5385, "theoretical_loss": 3.3210363693598737, "tokens_seen": 2996764672 }, { "epoch": 0.02, "learning_rate": 0.0004946535500427716, "loss": 2.4859, "theoretical_loss": 3.321030784128012, "tokens_seen": 2996830208 }, { "epoch": 0.02, "learning_rate": 0.000494546621043627, "loss": 2.5137, "theoretical_loss": 3.321025199052487, "tokens_seen": 2996895744 }, { "epoch": 0.02, "objective/train/docs_used": 1643593, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.5586180686950684, "objective/train/theoretical_loss": 3.321019614133292, "objective/train/tokens_used": 27012576, "theoretical_loss": 3.321019614133292, "tokens_seen": 2996961280 }, { "epoch": 0.02, "learning_rate": 0.0004944396920444824, "loss": 2.5684, "theoretical_loss": 3.321019614133292, "tokens_seen": 2996961280 }, { "epoch": 0.02, "learning_rate": 0.000494332763045338, "loss": 2.5426, "theoretical_loss": 3.321014029370419, "tokens_seen": 2997026816 }, { "epoch": 0.02, "learning_rate": 0.0004942258340461933, "loss": 2.6556, "theoretical_loss": 3.3210084447638595, "tokens_seen": 2997092352 }, { "epoch": 0.02, "learning_rate": 0.0004941189050470488, "loss": 2.668, "theoretical_loss": 3.3210028603136066, "tokens_seen": 2997157888 }, { "epoch": 0.02, "learning_rate": 0.0004940119760479042, "loss": 2.544, "theoretical_loss": 3.320997276019652, "tokens_seen": 2997223424 }, { "epoch": 0.02, "learning_rate": 0.0004939050470487596, "loss": 2.4703, "theoretical_loss": 3.320991691881988, "tokens_seen": 2997288960 }, { "epoch": 0.02, "learning_rate": 0.000493798118049615, "loss": 2.4878, "theoretical_loss": 3.3209861079006067, "tokens_seen": 2997354496 }, { "epoch": 0.02, "learning_rate": 0.0004936911890504705, "loss": 2.5617, "theoretical_loss": 3.320980524075501, "tokens_seen": 2997420032 }, { "epoch": 0.02, "learning_rate": 0.000493584260051326, "loss": 2.6501, "theoretical_loss": 3.320974940406662, "tokens_seen": 2997485568 }, { "epoch": 0.02, "learning_rate": 0.0004934773310521813, "loss": 2.4307, "theoretical_loss": 3.3209693568940826, "tokens_seen": 2997551104 }, { "epoch": 0.02, "learning_rate": 0.0004933704020530368, "loss": 2.4372, "theoretical_loss": 3.320963773537755, "tokens_seen": 2997616640 }, { "epoch": 0.02, "learning_rate": 0.0004932634730538923, "loss": 2.5929, "theoretical_loss": 3.3209581903376715, "tokens_seen": 2997682176 }, { "epoch": 0.02, "learning_rate": 0.0004931565440547477, "loss": 2.3358, "theoretical_loss": 3.320952607293824, "tokens_seen": 2997747712 }, { "epoch": 0.02, "learning_rate": 0.0004930496150556031, "loss": 2.6659, "theoretical_loss": 3.3209470244062045, "tokens_seen": 2997813248 }, { "epoch": 0.02, "learning_rate": 0.0004929426860564585, "loss": 2.5873, "theoretical_loss": 3.320941441674806, "tokens_seen": 2997878784 }, { "epoch": 0.02, "learning_rate": 0.0004928357570573139, "loss": 2.5936, "theoretical_loss": 3.32093585909962, "tokens_seen": 2997944320 }, { "epoch": 0.02, "learning_rate": 0.0004927288280581693, "loss": 2.5437, "theoretical_loss": 3.320930276680639, "tokens_seen": 2998009856 }, { "epoch": 0.02, "learning_rate": 0.0004926218990590249, "loss": 2.5207, "theoretical_loss": 3.320924694417855, "tokens_seen": 2998075392 }, { "epoch": 0.02, "learning_rate": 0.0004925149700598803, "loss": 2.624, "theoretical_loss": 3.3209191123112607, "tokens_seen": 2998140928 }, { "epoch": 0.03, "learning_rate": 0.0004924080410607357, "loss": 2.6163, "theoretical_loss": 3.3209135303608477, "tokens_seen": 2998206464 }, { "epoch": 0.03, "learning_rate": 0.0004923011120615911, "loss": 2.502, "theoretical_loss": 3.320907948566609, "tokens_seen": 2998272000 }, { "epoch": 0.03, "learning_rate": 0.0004921941830624465, "loss": 2.4689, "theoretical_loss": 3.320902366928536, "tokens_seen": 2998337536 }, { "epoch": 0.03, "learning_rate": 0.000492087254063302, "loss": 2.7454, "theoretical_loss": 3.3208967854466214, "tokens_seen": 2998403072 }, { "epoch": 0.03, "learning_rate": 0.0004919803250641574, "loss": 2.601, "theoretical_loss": 3.320891204120857, "tokens_seen": 2998468608 }, { "epoch": 0.03, "learning_rate": 0.0004918733960650129, "loss": 2.2926, "theoretical_loss": 3.3208856229512356, "tokens_seen": 2998534144 }, { "epoch": 0.03, "objective/train/docs_used": 1644132, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.451066732406616, "objective/train/theoretical_loss": 3.320880041937749, "objective/train/tokens_used": 28650976, "theoretical_loss": 3.320880041937749, "tokens_seen": 2998599680 }, { "epoch": 0.03, "learning_rate": 0.0004917664670658682, "loss": 2.6377, "theoretical_loss": 3.320880041937749, "tokens_seen": 2998599680 }, { "epoch": 0.03, "learning_rate": 0.0004916595380667237, "loss": 2.5554, "theoretical_loss": 3.3208744610803898, "tokens_seen": 2998665216 }, { "epoch": 0.03, "learning_rate": 0.0004915526090675792, "loss": 2.6001, "theoretical_loss": 3.3208688803791495, "tokens_seen": 2998730752 }, { "epoch": 0.03, "learning_rate": 0.0004914456800684346, "loss": 2.5503, "theoretical_loss": 3.320863299834021, "tokens_seen": 2998796288 }, { "epoch": 0.03, "learning_rate": 0.00049133875106929, "loss": 2.5515, "theoretical_loss": 3.3208577194449966, "tokens_seen": 2998861824 }, { "epoch": 0.03, "learning_rate": 0.0004912318220701454, "loss": 2.5162, "theoretical_loss": 3.320852139212068, "tokens_seen": 2998927360 }, { "epoch": 0.03, "learning_rate": 0.0004911248930710008, "loss": 2.5311, "theoretical_loss": 3.3208465591352274, "tokens_seen": 2998992896 }, { "epoch": 0.03, "learning_rate": 0.0004910179640718563, "loss": 2.4144, "theoretical_loss": 3.3208409792144677, "tokens_seen": 2999058432 }, { "epoch": 0.03, "learning_rate": 0.0004909110350727117, "loss": 2.6587, "theoretical_loss": 3.3208353994497806, "tokens_seen": 2999123968 }, { "epoch": 0.03, "learning_rate": 0.0004908041060735672, "loss": 2.4581, "theoretical_loss": 3.320829819841158, "tokens_seen": 2999189504 }, { "epoch": 0.03, "learning_rate": 0.0004906971770744226, "loss": 2.5588, "theoretical_loss": 3.320824240388593, "tokens_seen": 2999255040 }, { "epoch": 0.03, "learning_rate": 0.000490590248075278, "loss": 2.4129, "theoretical_loss": 3.320818661092077, "tokens_seen": 2999320576 }, { "epoch": 0.03, "learning_rate": 0.0004904833190761335, "loss": 2.5726, "theoretical_loss": 3.320813081951603, "tokens_seen": 2999386112 }, { "epoch": 0.03, "learning_rate": 0.0004903763900769889, "loss": 2.6172, "theoretical_loss": 3.3208075029671624, "tokens_seen": 2999451648 }, { "epoch": 0.03, "learning_rate": 0.0004902694610778443, "loss": 2.6785, "theoretical_loss": 3.320801924138748, "tokens_seen": 2999517184 }, { "epoch": 0.03, "learning_rate": 0.0004901625320786997, "loss": 2.6485, "theoretical_loss": 3.320796345466352, "tokens_seen": 2999582720 }, { "epoch": 0.03, "learning_rate": 0.0004900556030795552, "loss": 2.5381, "theoretical_loss": 3.3207907669499663, "tokens_seen": 2999648256 }, { "epoch": 0.03, "learning_rate": 0.0004899486740804106, "loss": 2.5339, "theoretical_loss": 3.320785188589584, "tokens_seen": 2999713792 }, { "epoch": 0.03, "learning_rate": 0.0004898417450812661, "loss": 2.6513, "theoretical_loss": 3.320779610385196, "tokens_seen": 2999779328 }, { "epoch": 0.03, "learning_rate": 0.0004897348160821215, "loss": 2.6681, "theoretical_loss": 3.3207740323367956, "tokens_seen": 2999844864 }, { "epoch": 0.03, "learning_rate": 0.0004896278870829769, "loss": 2.6658, "theoretical_loss": 3.320768454444374, "tokens_seen": 2999910400 }, { "epoch": 0.03, "learning_rate": 0.0004895209580838323, "loss": 2.6133, "theoretical_loss": 3.3207628767079242, "tokens_seen": 2999975936 }, { "epoch": 0.03, "learning_rate": 0.0004894140290846878, "loss": 2.3937, "theoretical_loss": 3.3207572991274388, "tokens_seen": 3000041472 }, { "epoch": 0.03, "learning_rate": 0.0004893071000855432, "loss": 2.433, "theoretical_loss": 3.3207517217029094, "tokens_seen": 3000107008 }, { "epoch": 0.03, "learning_rate": 0.0004892001710863986, "loss": 2.5588, "theoretical_loss": 3.3207461444343283, "tokens_seen": 3000172544 }, { "epoch": 0.03, "objective/train/docs_used": 1645467, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7996082305908203, "objective/train/theoretical_loss": 3.3207405673216877, "objective/train/tokens_used": 30289376, "theoretical_loss": 3.3207405673216877, "tokens_seen": 3000238080 }, { "epoch": 0.03, "learning_rate": 0.0004890932420872541, "loss": 2.7605, "theoretical_loss": 3.3207405673216877, "tokens_seen": 3000238080 }, { "epoch": 0.03, "learning_rate": 0.0004889863130881094, "loss": 2.6024, "theoretical_loss": 3.3207349903649797, "tokens_seen": 3000303616 }, { "epoch": 0.03, "learning_rate": 0.000488879384088965, "loss": 2.4747, "theoretical_loss": 3.320729413564197, "tokens_seen": 3000369152 }, { "epoch": 0.03, "learning_rate": 0.0004887724550898204, "loss": 2.5954, "theoretical_loss": 3.3207238369193313, "tokens_seen": 3000434688 }, { "epoch": 0.03, "learning_rate": 0.0004886655260906758, "loss": 2.6027, "theoretical_loss": 3.3207182604303753, "tokens_seen": 3000500224 }, { "epoch": 0.03, "learning_rate": 0.0004885585970915312, "loss": 2.6733, "theoretical_loss": 3.3207126840973213, "tokens_seen": 3000565760 }, { "epoch": 0.03, "learning_rate": 0.0004884516680923866, "loss": 2.5098, "theoretical_loss": 3.320707107920161, "tokens_seen": 3000631296 }, { "epoch": 0.03, "learning_rate": 0.0004883447390932422, "loss": 2.4302, "theoretical_loss": 3.3207015318988873, "tokens_seen": 3000696832 }, { "epoch": 0.03, "learning_rate": 0.0004882378100940975, "loss": 2.5576, "theoretical_loss": 3.3206959560334917, "tokens_seen": 3000762368 }, { "epoch": 0.03, "learning_rate": 0.00048813088109495294, "loss": 2.4798, "theoretical_loss": 3.320690380323967, "tokens_seen": 3000827904 }, { "epoch": 0.03, "learning_rate": 0.0004880239520958084, "loss": 2.3385, "theoretical_loss": 3.320684804770305, "tokens_seen": 3000893440 }, { "epoch": 0.03, "learning_rate": 0.0004879170230966638, "loss": 2.4282, "theoretical_loss": 3.3206792293724985, "tokens_seen": 3000958976 }, { "epoch": 0.03, "learning_rate": 0.00048781009409751925, "loss": 2.5659, "theoretical_loss": 3.3206736541305393, "tokens_seen": 3001024512 }, { "epoch": 0.03, "learning_rate": 0.0004877031650983747, "loss": 2.7023, "theoretical_loss": 3.3206680790444194, "tokens_seen": 3001090048 }, { "epoch": 0.03, "learning_rate": 0.0004875962360992301, "loss": 2.4634, "theoretical_loss": 3.3206625041141318, "tokens_seen": 3001155584 }, { "epoch": 0.03, "learning_rate": 0.00048748930710008556, "loss": 2.7005, "theoretical_loss": 3.3206569293396684, "tokens_seen": 3001221120 }, { "epoch": 0.04, "learning_rate": 0.000487382378100941, "loss": 2.6176, "theoretical_loss": 3.3206513547210212, "tokens_seen": 3001286656 }, { "epoch": 0.04, "learning_rate": 0.00048727544910179645, "loss": 2.4467, "theoretical_loss": 3.320645780258183, "tokens_seen": 3001352192 }, { "epoch": 0.04, "learning_rate": 0.0004871685201026518, "loss": 2.5512, "theoretical_loss": 3.320640205951145, "tokens_seen": 3001417728 }, { "epoch": 0.04, "learning_rate": 0.0004870615911035073, "loss": 2.5798, "theoretical_loss": 3.3206346317999005, "tokens_seen": 3001483264 }, { "epoch": 0.04, "learning_rate": 0.0004869546621043627, "loss": 2.6427, "theoretical_loss": 3.3206290578044415, "tokens_seen": 3001548800 }, { "epoch": 0.04, "learning_rate": 0.00048684773310521813, "loss": 2.596, "theoretical_loss": 3.32062348396476, "tokens_seen": 3001614336 }, { "epoch": 0.04, "learning_rate": 0.0004867408041060736, "loss": 2.6275, "theoretical_loss": 3.3206179102808484, "tokens_seen": 3001679872 }, { "epoch": 0.04, "learning_rate": 0.000486633875106929, "loss": 2.6446, "theoretical_loss": 3.3206123367526987, "tokens_seen": 3001745408 }, { "epoch": 0.04, "learning_rate": 0.00048652694610778444, "loss": 2.248, "theoretical_loss": 3.3206067633803036, "tokens_seen": 3001810944 }, { "epoch": 0.04, "objective/train/docs_used": 1646713, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.3031907081604004, "objective/train/theoretical_loss": 3.320601190163655, "objective/train/tokens_used": 31927776, "theoretical_loss": 3.320601190163655, "tokens_seen": 3001876480 }, { "epoch": 0.04, "learning_rate": 0.00048642001710863986, "loss": 2.5457, "theoretical_loss": 3.320601190163655, "tokens_seen": 3001876480 }, { "epoch": 0.04, "learning_rate": 0.00048631308810949533, "loss": 2.6029, "theoretical_loss": 3.320595617102745, "tokens_seen": 3001942016 }, { "epoch": 0.04, "learning_rate": 0.0004862061591103507, "loss": 2.4349, "theoretical_loss": 3.320590044197566, "tokens_seen": 3002007552 }, { "epoch": 0.04, "learning_rate": 0.00048609923011120617, "loss": 2.6641, "theoretical_loss": 3.320584471448111, "tokens_seen": 3002073088 }, { "epoch": 0.04, "learning_rate": 0.0004859923011120616, "loss": 2.6479, "theoretical_loss": 3.320578898854371, "tokens_seen": 3002138624 }, { "epoch": 0.04, "learning_rate": 0.00048588537211291706, "loss": 2.6043, "theoretical_loss": 3.3205733264163393, "tokens_seen": 3002204160 }, { "epoch": 0.04, "learning_rate": 0.0004857784431137724, "loss": 2.5969, "theoretical_loss": 3.3205677541340073, "tokens_seen": 3002269696 }, { "epoch": 0.04, "learning_rate": 0.0004856715141146279, "loss": 2.5909, "theoretical_loss": 3.320562182007368, "tokens_seen": 3002335232 }, { "epoch": 0.04, "learning_rate": 0.00048556458511548337, "loss": 2.6341, "theoretical_loss": 3.3205566100364132, "tokens_seen": 3002400768 }, { "epoch": 0.04, "learning_rate": 0.00048545765611633873, "loss": 2.5535, "theoretical_loss": 3.320551038221135, "tokens_seen": 3002466304 }, { "epoch": 0.04, "learning_rate": 0.0004853507271171942, "loss": 2.6686, "theoretical_loss": 3.320545466561526, "tokens_seen": 3002531840 }, { "epoch": 0.04, "learning_rate": 0.0004852437981180496, "loss": 2.4385, "theoretical_loss": 3.3205398950575784, "tokens_seen": 3002597376 }, { "epoch": 0.04, "learning_rate": 0.00048513686911890504, "loss": 2.2543, "theoretical_loss": 3.3205343237092846, "tokens_seen": 3002662912 }, { "epoch": 0.04, "learning_rate": 0.00048502994011976046, "loss": 2.2638, "theoretical_loss": 3.320528752516636, "tokens_seen": 3002728448 }, { "epoch": 0.04, "learning_rate": 0.00048492301112061594, "loss": 2.6381, "theoretical_loss": 3.320523181479626, "tokens_seen": 3002793984 }, { "epoch": 0.04, "learning_rate": 0.00048481608212147135, "loss": 2.3925, "theoretical_loss": 3.3205176105982463, "tokens_seen": 3002859520 }, { "epoch": 0.04, "learning_rate": 0.00048470915312232677, "loss": 2.4351, "theoretical_loss": 3.320512039872489, "tokens_seen": 3002925056 }, { "epoch": 0.04, "learning_rate": 0.00048460222412318225, "loss": 2.5748, "theoretical_loss": 3.320506469302347, "tokens_seen": 3002990592 }, { "epoch": 0.04, "learning_rate": 0.00048449529512403766, "loss": 2.6547, "theoretical_loss": 3.320500898887812, "tokens_seen": 3003056128 }, { "epoch": 0.04, "learning_rate": 0.0004843883661248931, "loss": 2.5674, "theoretical_loss": 3.3204953286288763, "tokens_seen": 3003121664 }, { "epoch": 0.04, "learning_rate": 0.0004842814371257485, "loss": 2.8503, "theoretical_loss": 3.3204897585255324, "tokens_seen": 3003187200 }, { "epoch": 0.04, "learning_rate": 0.000484174508126604, "loss": 2.3909, "theoretical_loss": 3.3204841885777725, "tokens_seen": 3003252736 }, { "epoch": 0.04, "learning_rate": 0.00048406757912745934, "loss": 2.6026, "theoretical_loss": 3.3204786187855886, "tokens_seen": 3003318272 }, { "epoch": 0.04, "learning_rate": 0.0004839606501283148, "loss": 2.3427, "theoretical_loss": 3.3204730491489727, "tokens_seen": 3003383808 }, { "epoch": 0.04, "learning_rate": 0.00048385372112917023, "loss": 2.5079, "theoretical_loss": 3.320467479667918, "tokens_seen": 3003449344 }, { "epoch": 0.04, "objective/train/docs_used": 1647443, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.66249418258667, "objective/train/theoretical_loss": 3.3204619103424164, "objective/train/tokens_used": 33566176, "theoretical_loss": 3.3204619103424164, "tokens_seen": 3003514880 }, { "epoch": 0.04, "learning_rate": 0.00048374679213002565, "loss": 2.5391, "theoretical_loss": 3.3204619103424164, "tokens_seen": 3003514880 }, { "epoch": 0.04, "learning_rate": 0.0004836398631308811, "loss": 2.5985, "theoretical_loss": 3.3204563411724597, "tokens_seen": 3003580416 }, { "epoch": 0.04, "learning_rate": 0.00048353293413173654, "loss": 2.5948, "theoretical_loss": 3.3204507721580403, "tokens_seen": 3003645952 }, { "epoch": 0.04, "learning_rate": 0.000483426005132592, "loss": 2.3641, "theoretical_loss": 3.320445203299151, "tokens_seen": 3003711488 }, { "epoch": 0.04, "learning_rate": 0.0004833190761334474, "loss": 2.607, "theoretical_loss": 3.3204396345957834, "tokens_seen": 3003777024 }, { "epoch": 0.04, "learning_rate": 0.00048321214713430285, "loss": 2.6218, "theoretical_loss": 3.3204340660479303, "tokens_seen": 3003842560 }, { "epoch": 0.04, "learning_rate": 0.00048310521813515827, "loss": 2.7135, "theoretical_loss": 3.320428497655584, "tokens_seen": 3003908096 }, { "epoch": 0.04, "learning_rate": 0.0004829982891360137, "loss": 2.7583, "theoretical_loss": 3.320422929418736, "tokens_seen": 3003973632 }, { "epoch": 0.04, "learning_rate": 0.0004828913601368691, "loss": 2.3356, "theoretical_loss": 3.320417361337379, "tokens_seen": 3004039168 }, { "epoch": 0.04, "learning_rate": 0.0004827844311377246, "loss": 2.6196, "theoretical_loss": 3.320411793411506, "tokens_seen": 3004104704 }, { "epoch": 0.04, "learning_rate": 0.00048267750213858, "loss": 2.5513, "theoretical_loss": 3.3204062256411078, "tokens_seen": 3004170240 }, { "epoch": 0.04, "learning_rate": 0.0004825705731394354, "loss": 2.3649, "theoretical_loss": 3.320400658026178, "tokens_seen": 3004235776 }, { "epoch": 0.04, "learning_rate": 0.0004824636441402909, "loss": 2.5991, "theoretical_loss": 3.320395090566708, "tokens_seen": 3004301312 }, { "epoch": 0.05, "learning_rate": 0.00048235671514114625, "loss": 2.5668, "theoretical_loss": 3.3203895232626905, "tokens_seen": 3004366848 }, { "epoch": 0.05, "learning_rate": 0.0004822497861420017, "loss": 2.6184, "theoretical_loss": 3.3203839561141173, "tokens_seen": 3004432384 }, { "epoch": 0.05, "learning_rate": 0.00048214285714285715, "loss": 2.4345, "theoretical_loss": 3.3203783891209815, "tokens_seen": 3004497920 }, { "epoch": 0.05, "learning_rate": 0.0004820359281437126, "loss": 2.702, "theoretical_loss": 3.320372822283275, "tokens_seen": 3004563456 }, { "epoch": 0.05, "learning_rate": 0.000481928999144568, "loss": 2.4543, "theoretical_loss": 3.3203672556009898, "tokens_seen": 3004628992 }, { "epoch": 0.05, "learning_rate": 0.00048182207014542346, "loss": 2.7525, "theoretical_loss": 3.3203616890741183, "tokens_seen": 3004694528 }, { "epoch": 0.05, "learning_rate": 0.00048171514114627893, "loss": 2.5572, "theoretical_loss": 3.320356122702653, "tokens_seen": 3004760064 }, { "epoch": 0.05, "learning_rate": 0.0004816082121471343, "loss": 2.4409, "theoretical_loss": 3.3203505564865856, "tokens_seen": 3004825600 }, { "epoch": 0.05, "learning_rate": 0.00048150128314798977, "loss": 2.5298, "theoretical_loss": 3.320344990425909, "tokens_seen": 3004891136 }, { "epoch": 0.05, "learning_rate": 0.0004813943541488452, "loss": 2.5959, "theoretical_loss": 3.3203394245206153, "tokens_seen": 3004956672 }, { "epoch": 0.05, "learning_rate": 0.0004812874251497006, "loss": 2.5989, "theoretical_loss": 3.3203338587706965, "tokens_seen": 3005022208 }, { "epoch": 0.05, "learning_rate": 0.000481180496150556, "loss": 2.3394, "theoretical_loss": 3.320328293176145, "tokens_seen": 3005087744 }, { "epoch": 0.05, "objective/train/docs_used": 1647973, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.042917013168335, "objective/train/theoretical_loss": 3.3203227277369534, "objective/train/tokens_used": 35204576, "theoretical_loss": 3.3203227277369534, "tokens_seen": 3005153280 }, { "epoch": 0.05, "learning_rate": 0.0004810735671514115, "loss": 2.4204, "theoretical_loss": 3.3203227277369534, "tokens_seen": 3005153280 }, { "epoch": 0.05, "learning_rate": 0.00048096663815226686, "loss": 2.4667, "theoretical_loss": 3.320317162453114, "tokens_seen": 3005218816 }, { "epoch": 0.05, "learning_rate": 0.00048085970915312233, "loss": 2.6191, "theoretical_loss": 3.3203115973246184, "tokens_seen": 3005284352 }, { "epoch": 0.05, "learning_rate": 0.00048075278015397775, "loss": 2.6692, "theoretical_loss": 3.3203060323514593, "tokens_seen": 3005349888 }, { "epoch": 0.05, "learning_rate": 0.0004806458511548332, "loss": 2.6315, "theoretical_loss": 3.320300467533629, "tokens_seen": 3005415424 }, { "epoch": 0.05, "learning_rate": 0.00048053892215568864, "loss": 2.3996, "theoretical_loss": 3.3202949028711197, "tokens_seen": 3005480960 }, { "epoch": 0.05, "learning_rate": 0.00048043199315654406, "loss": 2.6156, "theoretical_loss": 3.320289338363924, "tokens_seen": 3005546496 }, { "epoch": 0.05, "learning_rate": 0.00048032506415739953, "loss": 2.6943, "theoretical_loss": 3.3202837740120335, "tokens_seen": 3005612032 }, { "epoch": 0.05, "learning_rate": 0.0004802181351582549, "loss": 2.5853, "theoretical_loss": 3.320278209815441, "tokens_seen": 3005677568 }, { "epoch": 0.05, "learning_rate": 0.00048011120615911037, "loss": 2.7011, "theoretical_loss": 3.3202726457741387, "tokens_seen": 3005743104 }, { "epoch": 0.05, "learning_rate": 0.0004800042771599658, "loss": 2.6477, "theoretical_loss": 3.320267081888119, "tokens_seen": 3005808640 }, { "epoch": 0.05, "learning_rate": 0.0004798973481608212, "loss": 2.5143, "theoretical_loss": 3.320261518157374, "tokens_seen": 3005874176 }, { "epoch": 0.05, "learning_rate": 0.0004797904191616766, "loss": 2.5315, "theoretical_loss": 3.3202559545818957, "tokens_seen": 3005939712 }, { "epoch": 0.05, "learning_rate": 0.0004796834901625321, "loss": 2.7104, "theoretical_loss": 3.3202503911616765, "tokens_seen": 3006005248 }, { "epoch": 0.05, "learning_rate": 0.00047957656116338757, "loss": 2.313, "theoretical_loss": 3.320244827896709, "tokens_seen": 3006070784 }, { "epoch": 0.05, "learning_rate": 0.00047946963216424294, "loss": 2.346, "theoretical_loss": 3.320239264786986, "tokens_seen": 3006136320 }, { "epoch": 0.05, "learning_rate": 0.0004793627031650984, "loss": 2.4753, "theoretical_loss": 3.3202337018324983, "tokens_seen": 3006201856 }, { "epoch": 0.05, "learning_rate": 0.00047925577416595383, "loss": 2.6649, "theoretical_loss": 3.3202281390332393, "tokens_seen": 3006267392 }, { "epoch": 0.05, "learning_rate": 0.00047914884516680925, "loss": 2.4813, "theoretical_loss": 3.3202225763892015, "tokens_seen": 3006332928 }, { "epoch": 0.05, "learning_rate": 0.00047904191616766467, "loss": 2.581, "theoretical_loss": 3.320217013900376, "tokens_seen": 3006398464 }, { "epoch": 0.05, "learning_rate": 0.00047893498716852014, "loss": 2.5711, "theoretical_loss": 3.320211451566756, "tokens_seen": 3006464000 }, { "epoch": 0.05, "learning_rate": 0.0004788280581693755, "loss": 2.5481, "theoretical_loss": 3.3202058893883333, "tokens_seen": 3006529536 }, { "epoch": 0.05, "learning_rate": 0.000478721129170231, "loss": 2.7041, "theoretical_loss": 3.320200327365101, "tokens_seen": 3006595072 }, { "epoch": 0.05, "learning_rate": 0.00047861420017108645, "loss": 2.5201, "theoretical_loss": 3.3201947654970505, "tokens_seen": 3006660608 }, { "epoch": 0.05, "learning_rate": 0.0004785072711719418, "loss": 2.6215, "theoretical_loss": 3.3201892037841745, "tokens_seen": 3006726144 }, { "epoch": 0.05, "objective/train/docs_used": 1649106, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.610767364501953, "objective/train/theoretical_loss": 3.320183642226465, "objective/train/tokens_used": 36842976, "theoretical_loss": 3.320183642226465, "tokens_seen": 3006791680 }, { "epoch": 0.05, "learning_rate": 0.0004784003421727973, "loss": 2.6786, "theoretical_loss": 3.320183642226465, "tokens_seen": 3006791680 }, { "epoch": 0.05, "learning_rate": 0.0004782934131736527, "loss": 2.6331, "theoretical_loss": 3.3201780808239145, "tokens_seen": 3006857216 }, { "epoch": 0.05, "learning_rate": 0.0004781864841745082, "loss": 2.6597, "theoretical_loss": 3.3201725195765155, "tokens_seen": 3006922752 }, { "epoch": 0.05, "learning_rate": 0.00047807955517536354, "loss": 2.5184, "theoretical_loss": 3.32016695848426, "tokens_seen": 3006988288 }, { "epoch": 0.05, "learning_rate": 0.000477972626176219, "loss": 2.6672, "theoretical_loss": 3.3201613975471402, "tokens_seen": 3007053824 }, { "epoch": 0.05, "learning_rate": 0.00047786569717707443, "loss": 2.6462, "theoretical_loss": 3.3201558367651485, "tokens_seen": 3007119360 }, { "epoch": 0.05, "learning_rate": 0.00047775876817792985, "loss": 2.4911, "theoretical_loss": 3.3201502761382775, "tokens_seen": 3007184896 }, { "epoch": 0.05, "learning_rate": 0.00047765183917878527, "loss": 2.6191, "theoretical_loss": 3.3201447156665194, "tokens_seen": 3007250432 }, { "epoch": 0.05, "learning_rate": 0.00047754491017964074, "loss": 2.4458, "theoretical_loss": 3.320139155349866, "tokens_seen": 3007315968 }, { "epoch": 0.05, "learning_rate": 0.00047743798118049616, "loss": 2.3037, "theoretical_loss": 3.32013359518831, "tokens_seen": 3007381504 }, { "epoch": 0.06, "learning_rate": 0.0004773310521813516, "loss": 2.3071, "theoretical_loss": 3.3201280351818436, "tokens_seen": 3007447040 }, { "epoch": 0.06, "learning_rate": 0.00047722412318220705, "loss": 2.6427, "theoretical_loss": 3.320122475330459, "tokens_seen": 3007512576 }, { "epoch": 0.06, "learning_rate": 0.0004771171941830624, "loss": 2.5393, "theoretical_loss": 3.320116915634149, "tokens_seen": 3007578112 }, { "epoch": 0.06, "learning_rate": 0.0004770102651839179, "loss": 2.2973, "theoretical_loss": 3.320111356092905, "tokens_seen": 3007643648 }, { "epoch": 0.06, "learning_rate": 0.0004769033361847733, "loss": 2.7483, "theoretical_loss": 3.3201057967067205, "tokens_seen": 3007709184 }, { "epoch": 0.06, "learning_rate": 0.0004767964071856288, "loss": 2.5022, "theoretical_loss": 3.320100237475587, "tokens_seen": 3007774720 }, { "epoch": 0.06, "learning_rate": 0.00047668947818648415, "loss": 2.3979, "theoretical_loss": 3.3200946783994962, "tokens_seen": 3007840256 }, { "epoch": 0.06, "learning_rate": 0.0004765825491873396, "loss": 2.651, "theoretical_loss": 3.320089119478441, "tokens_seen": 3007905792 }, { "epoch": 0.06, "learning_rate": 0.0004764756201881951, "loss": 2.6367, "theoretical_loss": 3.3200835607124146, "tokens_seen": 3007971328 }, { "epoch": 0.06, "learning_rate": 0.00047636869118905046, "loss": 2.4628, "theoretical_loss": 3.3200780021014085, "tokens_seen": 3008036864 }, { "epoch": 0.06, "learning_rate": 0.00047626176218990593, "loss": 2.5774, "theoretical_loss": 3.3200724436454143, "tokens_seen": 3008102400 }, { "epoch": 0.06, "learning_rate": 0.00047615483319076135, "loss": 2.7166, "theoretical_loss": 3.320066885344425, "tokens_seen": 3008167936 }, { "epoch": 0.06, "learning_rate": 0.00047604790419161677, "loss": 2.5213, "theoretical_loss": 3.3200613271984336, "tokens_seen": 3008233472 }, { "epoch": 0.06, "learning_rate": 0.0004759409751924722, "loss": 2.6397, "theoretical_loss": 3.320055769207431, "tokens_seen": 3008299008 }, { "epoch": 0.06, "learning_rate": 0.00047583404619332766, "loss": 2.5147, "theoretical_loss": 3.3200502113714108, "tokens_seen": 3008364544 }, { "epoch": 0.06, "objective/train/docs_used": 1649879, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.4313642978668213, "objective/train/theoretical_loss": 3.3200446536903643, "objective/train/tokens_used": 38481376, "theoretical_loss": 3.3200446536903643, "tokens_seen": 3008430080 }, { "epoch": 0.06, "learning_rate": 0.000475727117194183, "loss": 2.7199, "theoretical_loss": 3.3200446536903643, "tokens_seen": 3008430080 }, { "epoch": 0.06, "learning_rate": 0.0004756201881950385, "loss": 2.429, "theoretical_loss": 3.3200390961642845, "tokens_seen": 3008495616 }, { "epoch": 0.06, "learning_rate": 0.00047551325919589397, "loss": 2.7449, "theoretical_loss": 3.320033538793163, "tokens_seen": 3008561152 }, { "epoch": 0.06, "learning_rate": 0.0004754063301967494, "loss": 2.2979, "theoretical_loss": 3.3200279815769926, "tokens_seen": 3008626688 }, { "epoch": 0.06, "learning_rate": 0.0004752994011976048, "loss": 2.5655, "theoretical_loss": 3.320022424515766, "tokens_seen": 3008692224 }, { "epoch": 0.06, "learning_rate": 0.0004751924721984602, "loss": 2.3837, "theoretical_loss": 3.3200168676094743, "tokens_seen": 3008757760 }, { "epoch": 0.06, "learning_rate": 0.0004750855431993157, "loss": 2.3228, "theoretical_loss": 3.320011310858111, "tokens_seen": 3008823296 }, { "epoch": 0.06, "learning_rate": 0.00047497861420017106, "loss": 2.4002, "theoretical_loss": 3.320005754261668, "tokens_seen": 3008888832 }, { "epoch": 0.06, "learning_rate": 0.00047487168520102653, "loss": 2.5257, "theoretical_loss": 3.320000197820137, "tokens_seen": 3008954368 }, { "epoch": 0.06, "learning_rate": 0.00047476475620188195, "loss": 2.6074, "theoretical_loss": 3.319994641533511, "tokens_seen": 3009019904 }, { "epoch": 0.06, "learning_rate": 0.00047465782720273737, "loss": 2.6883, "theoretical_loss": 3.3199890854017826, "tokens_seen": 3009085440 }, { "epoch": 0.06, "learning_rate": 0.00047455089820359284, "loss": 2.3395, "theoretical_loss": 3.319983529424943, "tokens_seen": 3009150976 }, { "epoch": 0.06, "learning_rate": 0.00047444396920444826, "loss": 2.4104, "theoretical_loss": 3.319977973602986, "tokens_seen": 3009216512 }, { "epoch": 0.06, "learning_rate": 0.00047433704020530374, "loss": 2.4436, "theoretical_loss": 3.3199724179359027, "tokens_seen": 3009282048 }, { "epoch": 0.06, "learning_rate": 0.0004742301112061591, "loss": 2.3034, "theoretical_loss": 3.3199668624236853, "tokens_seen": 3009347584 }, { "epoch": 0.06, "learning_rate": 0.00047412318220701457, "loss": 2.4065, "theoretical_loss": 3.319961307066327, "tokens_seen": 3009413120 }, { "epoch": 0.06, "learning_rate": 0.00047401625320787, "loss": 2.5668, "theoretical_loss": 3.31995575186382, "tokens_seen": 3009478656 }, { "epoch": 0.06, "learning_rate": 0.0004739093242087254, "loss": 2.5791, "theoretical_loss": 3.3199501968161558, "tokens_seen": 3009544192 }, { "epoch": 0.06, "learning_rate": 0.00047380239520958083, "loss": 2.6598, "theoretical_loss": 3.3199446419233274, "tokens_seen": 3009609728 }, { "epoch": 0.06, "learning_rate": 0.0004736954662104363, "loss": 2.7689, "theoretical_loss": 3.319939087185327, "tokens_seen": 3009675264 }, { "epoch": 0.06, "learning_rate": 0.00047358853721129167, "loss": 2.6223, "theoretical_loss": 3.319933532602147, "tokens_seen": 3009740800 }, { "epoch": 0.06, "learning_rate": 0.00047348160821214714, "loss": 2.4611, "theoretical_loss": 3.3199279781737796, "tokens_seen": 3009806336 }, { "epoch": 0.06, "learning_rate": 0.0004733746792130026, "loss": 2.5834, "theoretical_loss": 3.3199224239002167, "tokens_seen": 3009871872 }, { "epoch": 0.06, "learning_rate": 0.000473267750213858, "loss": 2.7345, "theoretical_loss": 3.3199168697814514, "tokens_seen": 3009937408 }, { "epoch": 0.06, "learning_rate": 0.00047316082121471345, "loss": 2.456, "theoretical_loss": 3.3199113158174756, "tokens_seen": 3010002944 }, { "epoch": 0.06, "objective/train/docs_used": 1651225, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.715864896774292, "objective/train/theoretical_loss": 3.3199057620082812, "objective/train/tokens_used": 40119776, "theoretical_loss": 3.3199057620082812, "tokens_seen": 3010068480 }, { "epoch": 0.06, "learning_rate": 0.00047305389221556887, "loss": 2.441, "theoretical_loss": 3.3199057620082812, "tokens_seen": 3010068480 }, { "epoch": 0.06, "learning_rate": 0.00047294696321642434, "loss": 2.4536, "theoretical_loss": 3.3199002083538613, "tokens_seen": 3010134016 }, { "epoch": 0.06, "learning_rate": 0.0004728400342172797, "loss": 2.6436, "theoretical_loss": 3.319894654854208, "tokens_seen": 3010199552 }, { "epoch": 0.06, "learning_rate": 0.0004727331052181352, "loss": 2.3205, "theoretical_loss": 3.319889101509313, "tokens_seen": 3010265088 }, { "epoch": 0.06, "learning_rate": 0.0004726261762189906, "loss": 2.6071, "theoretical_loss": 3.3198835483191695, "tokens_seen": 3010330624 }, { "epoch": 0.06, "learning_rate": 0.000472519247219846, "loss": 2.6072, "theoretical_loss": 3.319877995283769, "tokens_seen": 3010396160 }, { "epoch": 0.06, "learning_rate": 0.0004724123182207015, "loss": 2.5943, "theoretical_loss": 3.319872442403105, "tokens_seen": 3010461696 }, { "epoch": 0.06, "learning_rate": 0.0004723053892215569, "loss": 2.4161, "theoretical_loss": 3.3198668896771686, "tokens_seen": 3010527232 }, { "epoch": 0.07, "learning_rate": 0.0004721984602224123, "loss": 2.5087, "theoretical_loss": 3.3198613371059524, "tokens_seen": 3010592768 }, { "epoch": 0.07, "learning_rate": 0.00047209153122326774, "loss": 2.5444, "theoretical_loss": 3.319855784689449, "tokens_seen": 3010658304 }, { "epoch": 0.07, "learning_rate": 0.0004719846022241232, "loss": 2.6907, "theoretical_loss": 3.319850232427651, "tokens_seen": 3010723840 }, { "epoch": 0.07, "learning_rate": 0.0004718776732249786, "loss": 2.4364, "theoretical_loss": 3.31984468032055, "tokens_seen": 3010789376 }, { "epoch": 0.07, "learning_rate": 0.00047177074422583405, "loss": 2.3893, "theoretical_loss": 3.3198391283681383, "tokens_seen": 3010854912 }, { "epoch": 0.07, "learning_rate": 0.00047166381522668947, "loss": 2.5048, "theoretical_loss": 3.3198335765704092, "tokens_seen": 3010920448 }, { "epoch": 0.07, "learning_rate": 0.00047155688622754495, "loss": 2.6298, "theoretical_loss": 3.3198280249273546, "tokens_seen": 3010985984 }, { "epoch": 0.07, "learning_rate": 0.00047144995722840036, "loss": 2.8107, "theoretical_loss": 3.319822473438966, "tokens_seen": 3011051520 }, { "epoch": 0.07, "learning_rate": 0.0004713430282292558, "loss": 2.503, "theoretical_loss": 3.319816922105237, "tokens_seen": 3011117056 }, { "epoch": 0.07, "learning_rate": 0.00047123609923011126, "loss": 2.5555, "theoretical_loss": 3.3198113709261587, "tokens_seen": 3011182592 }, { "epoch": 0.07, "learning_rate": 0.0004711291702309666, "loss": 2.4985, "theoretical_loss": 3.319805819901724, "tokens_seen": 3011248128 }, { "epoch": 0.07, "learning_rate": 0.0004710222412318221, "loss": 2.4865, "theoretical_loss": 3.3198002690319255, "tokens_seen": 3011313664 }, { "epoch": 0.07, "learning_rate": 0.0004709153122326775, "loss": 2.5541, "theoretical_loss": 3.3197947183167553, "tokens_seen": 3011379200 }, { "epoch": 0.07, "learning_rate": 0.00047080838323353293, "loss": 2.4878, "theoretical_loss": 3.319789167756206, "tokens_seen": 3011444736 }, { "epoch": 0.07, "learning_rate": 0.00047070145423438835, "loss": 2.4241, "theoretical_loss": 3.319783617350269, "tokens_seen": 3011510272 }, { "epoch": 0.07, "learning_rate": 0.0004705945252352438, "loss": 2.6346, "theoretical_loss": 3.3197780670989374, "tokens_seen": 3011575808 }, { "epoch": 0.07, "learning_rate": 0.00047048759623609924, "loss": 2.3981, "theoretical_loss": 3.319772517002204, "tokens_seen": 3011641344 }, { "epoch": 0.07, "objective/train/docs_used": 1651868, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.2719619274139404, "objective/train/theoretical_loss": 3.31976696706006, "objective/train/tokens_used": 41758176, "theoretical_loss": 3.31976696706006, "tokens_seen": 3011706880 }, { "epoch": 0.07, "learning_rate": 0.00047038066723695466, "loss": 2.4118, "theoretical_loss": 3.31976696706006, "tokens_seen": 3011706880 }, { "epoch": 0.07, "learning_rate": 0.00047027373823781013, "loss": 2.6711, "theoretical_loss": 3.319761417272498, "tokens_seen": 3011772416 }, { "epoch": 0.07, "learning_rate": 0.00047016680923866555, "loss": 2.6537, "theoretical_loss": 3.319755867639511, "tokens_seen": 3011837952 }, { "epoch": 0.07, "learning_rate": 0.00047005988023952097, "loss": 2.6158, "theoretical_loss": 3.319750318161091, "tokens_seen": 3011903488 }, { "epoch": 0.07, "learning_rate": 0.0004699529512403764, "loss": 2.2521, "theoretical_loss": 3.3197447688372295, "tokens_seen": 3011969024 }, { "epoch": 0.07, "learning_rate": 0.00046984602224123186, "loss": 2.6413, "theoretical_loss": 3.3197392196679205, "tokens_seen": 3012034560 }, { "epoch": 0.07, "learning_rate": 0.0004697390932420872, "loss": 2.7066, "theoretical_loss": 3.3197336706531546, "tokens_seen": 3012100096 }, { "epoch": 0.07, "learning_rate": 0.0004696321642429427, "loss": 2.4956, "theoretical_loss": 3.3197281217929255, "tokens_seen": 3012165632 }, { "epoch": 0.07, "learning_rate": 0.00046952523524379817, "loss": 2.7685, "theoretical_loss": 3.319722573087225, "tokens_seen": 3012231168 }, { "epoch": 0.07, "learning_rate": 0.00046941830624465353, "loss": 2.6271, "theoretical_loss": 3.319717024536045, "tokens_seen": 3012296704 }, { "epoch": 0.07, "learning_rate": 0.000469311377245509, "loss": 2.6398, "theoretical_loss": 3.3197114761393784, "tokens_seen": 3012362240 }, { "epoch": 0.07, "learning_rate": 0.0004692044482463644, "loss": 2.5921, "theoretical_loss": 3.3197059278972176, "tokens_seen": 3012427776 }, { "epoch": 0.07, "learning_rate": 0.00046909751924721984, "loss": 2.6227, "theoretical_loss": 3.3197003798095546, "tokens_seen": 3012493312 }, { "epoch": 0.07, "learning_rate": 0.00046899059024807526, "loss": 2.6272, "theoretical_loss": 3.3196948318763817, "tokens_seen": 3012558848 }, { "epoch": 0.07, "learning_rate": 0.00046888366124893074, "loss": 2.4564, "theoretical_loss": 3.3196892840976915, "tokens_seen": 3012624384 }, { "epoch": 0.07, "learning_rate": 0.00046877673224978616, "loss": 2.5827, "theoretical_loss": 3.319683736473476, "tokens_seen": 3012689920 }, { "epoch": 0.07, "learning_rate": 0.0004686698032506416, "loss": 2.5221, "theoretical_loss": 3.319678189003728, "tokens_seen": 3012755456 }, { "epoch": 0.07, "learning_rate": 0.000468562874251497, "loss": 2.2411, "theoretical_loss": 3.3196726416884395, "tokens_seen": 3012820992 }, { "epoch": 0.07, "learning_rate": 0.00046845594525235247, "loss": 2.5399, "theoretical_loss": 3.319667094527603, "tokens_seen": 3012886528 }, { "epoch": 0.07, "learning_rate": 0.0004683490162532079, "loss": 2.5599, "theoretical_loss": 3.3196615475212106, "tokens_seen": 3012952064 }, { "epoch": 0.07, "learning_rate": 0.0004682420872540633, "loss": 2.6186, "theoretical_loss": 3.319656000669255, "tokens_seen": 3013017600 }, { "epoch": 0.07, "learning_rate": 0.0004681351582549188, "loss": 2.4364, "theoretical_loss": 3.3196504539717284, "tokens_seen": 3013083136 }, { "epoch": 0.07, "learning_rate": 0.00046802822925577414, "loss": 2.4645, "theoretical_loss": 3.319644907428623, "tokens_seen": 3013148672 }, { "epoch": 0.07, "learning_rate": 0.0004679213002566296, "loss": 2.3282, "theoretical_loss": 3.3196393610399317, "tokens_seen": 3013214208 }, { "epoch": 0.07, "learning_rate": 0.00046781437125748503, "loss": 2.3968, "theoretical_loss": 3.3196338148056457, "tokens_seen": 3013279744 }, { "epoch": 0.07, "objective/train/docs_used": 1652881, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.02851939201355, "objective/train/theoretical_loss": 3.3196282687257583, "objective/train/tokens_used": 43396576, "theoretical_loss": 3.3196282687257583, "tokens_seen": 3013345280 }, { "epoch": 0.07, "learning_rate": 0.0004677074422583405, "loss": 2.73, "theoretical_loss": 3.3196282687257583, "tokens_seen": 3013345280 }, { "epoch": 0.07, "learning_rate": 0.00046760051325919587, "loss": 2.7081, "theoretical_loss": 3.3196227228002617, "tokens_seen": 3013410816 }, { "epoch": 0.07, "learning_rate": 0.00046749358426005134, "loss": 2.4167, "theoretical_loss": 3.3196171770291483, "tokens_seen": 3013476352 }, { "epoch": 0.07, "learning_rate": 0.0004673866552609068, "loss": 2.6501, "theoretical_loss": 3.31961163141241, "tokens_seen": 3013541888 }, { "epoch": 0.07, "learning_rate": 0.0004672797262617622, "loss": 2.711, "theoretical_loss": 3.3196060859500394, "tokens_seen": 3013607424 }, { "epoch": 0.08, "learning_rate": 0.00046717279726261765, "loss": 2.6538, "theoretical_loss": 3.319600540642029, "tokens_seen": 3013672960 }, { "epoch": 0.08, "learning_rate": 0.00046706586826347307, "loss": 2.7554, "theoretical_loss": 3.319594995488371, "tokens_seen": 3013738496 }, { "epoch": 0.08, "learning_rate": 0.0004669589392643285, "loss": 2.6815, "theoretical_loss": 3.3195894504890573, "tokens_seen": 3013804032 }, { "epoch": 0.08, "learning_rate": 0.0004668520102651839, "loss": 2.4327, "theoretical_loss": 3.3195839056440812, "tokens_seen": 3013869568 }, { "epoch": 0.08, "learning_rate": 0.0004667450812660394, "loss": 2.6057, "theoretical_loss": 3.3195783609534346, "tokens_seen": 3013935104 }, { "epoch": 0.08, "learning_rate": 0.00046663815226689474, "loss": 2.8111, "theoretical_loss": 3.3195728164171094, "tokens_seen": 3014000640 }, { "epoch": 0.08, "learning_rate": 0.0004665312232677502, "loss": 2.5688, "theoretical_loss": 3.3195672720350986, "tokens_seen": 3014066176 }, { "epoch": 0.08, "learning_rate": 0.0004664242942686057, "loss": 2.4503, "theoretical_loss": 3.319561727807394, "tokens_seen": 3014131712 }, { "epoch": 0.08, "learning_rate": 0.0004663173652694611, "loss": 2.6877, "theoretical_loss": 3.319556183733989, "tokens_seen": 3014197248 }, { "epoch": 0.08, "learning_rate": 0.00046621043627031653, "loss": 2.6751, "theoretical_loss": 3.3195506398148744, "tokens_seen": 3014262784 }, { "epoch": 0.08, "learning_rate": 0.00046610350727117195, "loss": 2.4173, "theoretical_loss": 3.3195450960500437, "tokens_seen": 3014328320 }, { "epoch": 0.08, "learning_rate": 0.0004659965782720274, "loss": 2.6038, "theoretical_loss": 3.319539552439489, "tokens_seen": 3014393856 }, { "epoch": 0.08, "learning_rate": 0.0004658896492728828, "loss": 2.638, "theoretical_loss": 3.3195340089832026, "tokens_seen": 3014459392 }, { "epoch": 0.08, "learning_rate": 0.00046578272027373826, "loss": 2.5091, "theoretical_loss": 3.3195284656811763, "tokens_seen": 3014524928 }, { "epoch": 0.08, "learning_rate": 0.0004656757912745937, "loss": 2.807, "theoretical_loss": 3.3195229225334035, "tokens_seen": 3014590464 }, { "epoch": 0.08, "learning_rate": 0.0004655688622754491, "loss": 2.3861, "theoretical_loss": 3.319517379539876, "tokens_seen": 3014656000 }, { "epoch": 0.08, "learning_rate": 0.0004654619332763045, "loss": 2.8883, "theoretical_loss": 3.3195118367005856, "tokens_seen": 3014721536 }, { "epoch": 0.08, "learning_rate": 0.00046535500427716, "loss": 2.6202, "theoretical_loss": 3.3195062940155258, "tokens_seen": 3014787072 }, { "epoch": 0.08, "learning_rate": 0.0004652480752780154, "loss": 2.6868, "theoretical_loss": 3.3195007514846884, "tokens_seen": 3014852608 }, { "epoch": 0.08, "learning_rate": 0.0004651411462788708, "loss": 2.4873, "theoretical_loss": 3.3194952091080654, "tokens_seen": 3014918144 }, { "epoch": 0.08, "objective/train/docs_used": 1653310, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 1.8691486120224, "objective/train/theoretical_loss": 3.3194896668856497, "objective/train/tokens_used": 45034976, "theoretical_loss": 3.3194896668856497, "tokens_seen": 3014983680 }, { "epoch": 0.08, "learning_rate": 0.0004650342172797263, "loss": 2.2912, "theoretical_loss": 3.3194896668856497, "tokens_seen": 3014983680 }, { "epoch": 0.08, "learning_rate": 0.0004649272882805817, "loss": 2.609, "theoretical_loss": 3.3194841248174334, "tokens_seen": 3015049216 }, { "epoch": 0.08, "learning_rate": 0.00046482035928143713, "loss": 2.77, "theoretical_loss": 3.319478582903409, "tokens_seen": 3015114752 }, { "epoch": 0.08, "learning_rate": 0.00046471343028229255, "loss": 2.8639, "theoretical_loss": 3.3194730411435684, "tokens_seen": 3015180288 }, { "epoch": 0.08, "learning_rate": 0.000464606501283148, "loss": 2.601, "theoretical_loss": 3.3194674995379048, "tokens_seen": 3015245824 }, { "epoch": 0.08, "learning_rate": 0.0004644995722840034, "loss": 2.2598, "theoretical_loss": 3.3194619580864098, "tokens_seen": 3015311360 }, { "epoch": 0.08, "learning_rate": 0.00046439264328485886, "loss": 2.6682, "theoretical_loss": 3.3194564167890763, "tokens_seen": 3015376896 }, { "epoch": 0.08, "learning_rate": 0.00046428571428571433, "loss": 2.5971, "theoretical_loss": 3.3194508756458965, "tokens_seen": 3015442432 }, { "epoch": 0.08, "learning_rate": 0.0004641787852865697, "loss": 2.5004, "theoretical_loss": 3.319445334656862, "tokens_seen": 3015507968 }, { "epoch": 0.08, "learning_rate": 0.00046407185628742517, "loss": 2.7767, "theoretical_loss": 3.319439793821967, "tokens_seen": 3015573504 }, { "epoch": 0.08, "learning_rate": 0.0004639649272882806, "loss": 2.6512, "theoretical_loss": 3.3194342531412016, "tokens_seen": 3015639040 }, { "epoch": 0.08, "learning_rate": 0.000463857998289136, "loss": 2.6137, "theoretical_loss": 3.3194287126145596, "tokens_seen": 3015704576 }, { "epoch": 0.08, "learning_rate": 0.00046375106928999143, "loss": 2.4597, "theoretical_loss": 3.319423172242033, "tokens_seen": 3015770112 }, { "epoch": 0.08, "learning_rate": 0.0004636441402908469, "loss": 2.5436, "theoretical_loss": 3.3194176320236144, "tokens_seen": 3015835648 }, { "epoch": 0.08, "learning_rate": 0.0004635372112917023, "loss": 2.5871, "theoretical_loss": 3.319412091959296, "tokens_seen": 3015901184 }, { "epoch": 0.08, "learning_rate": 0.00046343028229255774, "loss": 2.709, "theoretical_loss": 3.31940655204907, "tokens_seen": 3015966720 }, { "epoch": 0.08, "learning_rate": 0.0004633233532934132, "loss": 2.7242, "theoretical_loss": 3.319401012292929, "tokens_seen": 3016032256 }, { "epoch": 0.08, "learning_rate": 0.00046321642429426863, "loss": 2.4904, "theoretical_loss": 3.319395472690865, "tokens_seen": 3016097792 }, { "epoch": 0.08, "learning_rate": 0.00046310949529512405, "loss": 2.4424, "theoretical_loss": 3.3193899332428707, "tokens_seen": 3016163328 }, { "epoch": 0.08, "learning_rate": 0.00046300256629597947, "loss": 2.5032, "theoretical_loss": 3.3193843939489382, "tokens_seen": 3016228864 }, { "epoch": 0.08, "learning_rate": 0.00046289563729683494, "loss": 2.702, "theoretical_loss": 3.3193788548090604, "tokens_seen": 3016294400 }, { "epoch": 0.08, "learning_rate": 0.0004627887082976903, "loss": 2.622, "theoretical_loss": 3.319373315823229, "tokens_seen": 3016359936 }, { "epoch": 0.08, "learning_rate": 0.0004626817792985458, "loss": 2.7673, "theoretical_loss": 3.319367776991437, "tokens_seen": 3016425472 }, { "epoch": 0.08, "learning_rate": 0.0004625748502994012, "loss": 2.4798, "theoretical_loss": 3.3193622383136763, "tokens_seen": 3016491008 }, { "epoch": 0.08, "learning_rate": 0.00046246792130025667, "loss": 2.919, "theoretical_loss": 3.3193566997899397, "tokens_seen": 3016556544 }, { "epoch": 0.08, "objective/train/docs_used": 1654610, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.208578109741211, "objective/train/theoretical_loss": 3.3193511614202187, "objective/train/tokens_used": 46673376, "theoretical_loss": 3.3193511614202187, "tokens_seen": 3016622080 }, { "epoch": 0.08, "learning_rate": 0.0004623609923011121, "loss": 2.5259, "theoretical_loss": 3.3193511614202187, "tokens_seen": 3016622080 }, { "epoch": 0.08, "learning_rate": 0.0004622540633019675, "loss": 2.5413, "theoretical_loss": 3.319345623204507, "tokens_seen": 3016687616 }, { "epoch": 0.09, "learning_rate": 0.000462147134302823, "loss": 2.7374, "theoretical_loss": 3.319340085142796, "tokens_seen": 3016753152 }, { "epoch": 0.09, "learning_rate": 0.00046204020530367834, "loss": 2.4026, "theoretical_loss": 3.319334547235078, "tokens_seen": 3016818688 }, { "epoch": 0.09, "learning_rate": 0.0004619332763045338, "loss": 2.5737, "theoretical_loss": 3.319329009481346, "tokens_seen": 3016884224 }, { "epoch": 0.09, "learning_rate": 0.00046182634730538923, "loss": 2.4429, "theoretical_loss": 3.319323471881592, "tokens_seen": 3016949760 }, { "epoch": 0.09, "learning_rate": 0.00046171941830624465, "loss": 2.6043, "theoretical_loss": 3.3193179344358086, "tokens_seen": 3017015296 }, { "epoch": 0.09, "learning_rate": 0.00046161248930710007, "loss": 2.5731, "theoretical_loss": 3.3193123971439875, "tokens_seen": 3017080832 }, { "epoch": 0.09, "learning_rate": 0.00046150556030795554, "loss": 2.6597, "theoretical_loss": 3.319306860006122, "tokens_seen": 3017146368 }, { "epoch": 0.09, "learning_rate": 0.0004613986313088109, "loss": 2.5896, "theoretical_loss": 3.319301323022204, "tokens_seen": 3017211904 }, { "epoch": 0.09, "learning_rate": 0.0004612917023096664, "loss": 2.7416, "theoretical_loss": 3.319295786192226, "tokens_seen": 3017277440 }, { "epoch": 0.09, "learning_rate": 0.00046118477331052185, "loss": 2.763, "theoretical_loss": 3.3192902495161802, "tokens_seen": 3017342976 }, { "epoch": 0.09, "learning_rate": 0.00046107784431137727, "loss": 2.6935, "theoretical_loss": 3.319284712994059, "tokens_seen": 3017408512 }, { "epoch": 0.09, "learning_rate": 0.0004609709153122327, "loss": 2.6153, "theoretical_loss": 3.319279176625855, "tokens_seen": 3017474048 }, { "epoch": 0.09, "learning_rate": 0.0004608639863130881, "loss": 2.5454, "theoretical_loss": 3.3192736404115606, "tokens_seen": 3017539584 }, { "epoch": 0.09, "learning_rate": 0.0004607570573139436, "loss": 2.564, "theoretical_loss": 3.319268104351168, "tokens_seen": 3017605120 }, { "epoch": 0.09, "learning_rate": 0.00046065012831479895, "loss": 2.4576, "theoretical_loss": 3.3192625684446693, "tokens_seen": 3017670656 }, { "epoch": 0.09, "learning_rate": 0.0004605431993156544, "loss": 2.8301, "theoretical_loss": 3.3192570326920574, "tokens_seen": 3017736192 }, { "epoch": 0.09, "learning_rate": 0.00046043627031650984, "loss": 2.5667, "theoretical_loss": 3.3192514970933242, "tokens_seen": 3017801728 }, { "epoch": 0.09, "learning_rate": 0.00046032934131736526, "loss": 2.2958, "theoretical_loss": 3.3192459616484626, "tokens_seen": 3017867264 }, { "epoch": 0.09, "learning_rate": 0.00046022241231822073, "loss": 2.5172, "theoretical_loss": 3.319240426357465, "tokens_seen": 3017932800 }, { "epoch": 0.09, "learning_rate": 0.00046011548331907615, "loss": 2.6184, "theoretical_loss": 3.319234891220323, "tokens_seen": 3017998336 }, { "epoch": 0.09, "learning_rate": 0.00046000855431993157, "loss": 2.6125, "theoretical_loss": 3.31922935623703, "tokens_seen": 3018063872 }, { "epoch": 0.09, "learning_rate": 0.000459901625320787, "loss": 2.5618, "theoretical_loss": 3.3192238214075775, "tokens_seen": 3018129408 }, { "epoch": 0.09, "learning_rate": 0.00045979469632164246, "loss": 2.6876, "theoretical_loss": 3.3192182867319584, "tokens_seen": 3018194944 }, { "epoch": 0.09, "objective/train/docs_used": 1655301, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.565507411956787, "objective/train/theoretical_loss": 3.319212752210165, "objective/train/tokens_used": 48311776, "theoretical_loss": 3.319212752210165, "tokens_seen": 3018260480 }, { "epoch": 0.09, "learning_rate": 0.0004596877673224979, "loss": 2.4072, "theoretical_loss": 3.319212752210165, "tokens_seen": 3018260480 }, { "epoch": 0.09, "learning_rate": 0.0004595808383233533, "loss": 2.565, "theoretical_loss": 3.3192072178421896, "tokens_seen": 3018326016 }, { "epoch": 0.09, "learning_rate": 0.0004594739093242087, "loss": 2.7793, "theoretical_loss": 3.3192016836280245, "tokens_seen": 3018391552 }, { "epoch": 0.09, "learning_rate": 0.0004593669803250642, "loss": 2.587, "theoretical_loss": 3.319196149567662, "tokens_seen": 3018457088 }, { "epoch": 0.09, "learning_rate": 0.0004592600513259196, "loss": 2.6595, "theoretical_loss": 3.3191906156610953, "tokens_seen": 3018522624 }, { "epoch": 0.09, "learning_rate": 0.000459153122326775, "loss": 2.6921, "theoretical_loss": 3.3191850819083157, "tokens_seen": 3018588160 }, { "epoch": 0.09, "learning_rate": 0.0004590461933276305, "loss": 2.5763, "theoretical_loss": 3.3191795483093163, "tokens_seen": 3018653696 }, { "epoch": 0.09, "learning_rate": 0.00045893926432848586, "loss": 2.5511, "theoretical_loss": 3.319174014864089, "tokens_seen": 3018719232 }, { "epoch": 0.09, "learning_rate": 0.00045883233532934134, "loss": 2.4876, "theoretical_loss": 3.3191684815726266, "tokens_seen": 3018784768 }, { "epoch": 0.09, "learning_rate": 0.00045872540633019675, "loss": 2.5574, "theoretical_loss": 3.319162948434921, "tokens_seen": 3018850304 }, { "epoch": 0.09, "learning_rate": 0.00045861847733105217, "loss": 2.6569, "theoretical_loss": 3.3191574154509658, "tokens_seen": 3018915840 }, { "epoch": 0.09, "learning_rate": 0.0004585115483319076, "loss": 2.6353, "theoretical_loss": 3.319151882620752, "tokens_seen": 3018981376 }, { "epoch": 0.09, "learning_rate": 0.00045840461933276306, "loss": 2.6696, "theoretical_loss": 3.319146349944272, "tokens_seen": 3019046912 }, { "epoch": 0.09, "learning_rate": 0.00045829769033361854, "loss": 2.5971, "theoretical_loss": 3.3191408174215193, "tokens_seen": 3019112448 }, { "epoch": 0.09, "learning_rate": 0.0004581907613344739, "loss": 2.6662, "theoretical_loss": 3.319135285052486, "tokens_seen": 3019177984 }, { "epoch": 0.09, "learning_rate": 0.0004580838323353294, "loss": 2.6373, "theoretical_loss": 3.3191297528371635, "tokens_seen": 3019243520 }, { "epoch": 0.09, "learning_rate": 0.0004579769033361848, "loss": 2.4068, "theoretical_loss": 3.319124220775545, "tokens_seen": 3019309056 }, { "epoch": 0.09, "learning_rate": 0.0004578699743370402, "loss": 2.5871, "theoretical_loss": 3.319118688867623, "tokens_seen": 3019374592 }, { "epoch": 0.09, "learning_rate": 0.00045776304533789563, "loss": 2.5294, "theoretical_loss": 3.319113157113389, "tokens_seen": 3019440128 }, { "epoch": 0.09, "learning_rate": 0.0004576561163387511, "loss": 2.4393, "theoretical_loss": 3.319107625512837, "tokens_seen": 3019505664 }, { "epoch": 0.09, "learning_rate": 0.00045754918733960647, "loss": 2.6992, "theoretical_loss": 3.3191020940659577, "tokens_seen": 3019571200 }, { "epoch": 0.09, "learning_rate": 0.00045744225834046194, "loss": 2.5488, "theoretical_loss": 3.3190965627727445, "tokens_seen": 3019636736 }, { "epoch": 0.09, "learning_rate": 0.00045733532934131736, "loss": 2.5638, "theoretical_loss": 3.3190910316331896, "tokens_seen": 3019702272 }, { "epoch": 0.09, "learning_rate": 0.00045722840034217283, "loss": 2.6045, "theoretical_loss": 3.3190855006472857, "tokens_seen": 3019767808 }, { "epoch": 0.1, "learning_rate": 0.00045712147134302825, "loss": 2.5425, "theoretical_loss": 3.319079969815024, "tokens_seen": 3019833344 }, { "epoch": 0.1, "objective/train/docs_used": 1656638, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6638667583465576, "objective/train/theoretical_loss": 3.3190744391363984, "objective/train/tokens_used": 49950176, "theoretical_loss": 3.3190744391363984, "tokens_seen": 3019898880 }, { "epoch": 0.1, "learning_rate": 0.00045701454234388367, "loss": 2.6716, "theoretical_loss": 3.3190744391363984, "tokens_seen": 3019898880 }, { "epoch": 0.1, "learning_rate": 0.00045690761334473914, "loss": 2.6018, "theoretical_loss": 3.3190689086114005, "tokens_seen": 3019964416 }, { "epoch": 0.1, "learning_rate": 0.0004568006843455945, "loss": 2.4702, "theoretical_loss": 3.3190633782400223, "tokens_seen": 3020029952 }, { "epoch": 0.1, "learning_rate": 0.00045669375534645, "loss": 2.5531, "theoretical_loss": 3.3190578480222577, "tokens_seen": 3020095488 }, { "epoch": 0.1, "learning_rate": 0.0004565868263473054, "loss": 2.5263, "theoretical_loss": 3.3190523179580973, "tokens_seen": 3020161024 }, { "epoch": 0.1, "learning_rate": 0.0004564798973481608, "loss": 2.6229, "theoretical_loss": 3.3190467880475345, "tokens_seen": 3020226560 }, { "epoch": 0.1, "learning_rate": 0.00045637296834901623, "loss": 2.479, "theoretical_loss": 3.3190412582905617, "tokens_seen": 3020292096 }, { "epoch": 0.1, "learning_rate": 0.0004562660393498717, "loss": 2.6546, "theoretical_loss": 3.319035728687171, "tokens_seen": 3020357632 }, { "epoch": 0.1, "learning_rate": 0.0004561591103507271, "loss": 2.5858, "theoretical_loss": 3.319030199237355, "tokens_seen": 3020423168 }, { "epoch": 0.1, "learning_rate": 0.00045605218135158254, "loss": 2.3675, "theoretical_loss": 3.319024669941106, "tokens_seen": 3020488704 }, { "epoch": 0.1, "learning_rate": 0.000455945252352438, "loss": 2.5846, "theoretical_loss": 3.3190191407984164, "tokens_seen": 3020554240 }, { "epoch": 0.1, "learning_rate": 0.00045583832335329344, "loss": 2.4535, "theoretical_loss": 3.319013611809279, "tokens_seen": 3020619776 }, { "epoch": 0.1, "learning_rate": 0.00045573139435414885, "loss": 2.7591, "theoretical_loss": 3.3190080829736854, "tokens_seen": 3020685312 }, { "epoch": 0.1, "learning_rate": 0.0004556244653550043, "loss": 2.5729, "theoretical_loss": 3.319002554291629, "tokens_seen": 3020750848 }, { "epoch": 0.1, "learning_rate": 0.00045551753635585975, "loss": 2.5332, "theoretical_loss": 3.318997025763101, "tokens_seen": 3020816384 }, { "epoch": 0.1, "learning_rate": 0.0004554106073567151, "loss": 2.5638, "theoretical_loss": 3.3189914973880947, "tokens_seen": 3020881920 }, { "epoch": 0.1, "learning_rate": 0.0004553036783575706, "loss": 2.476, "theoretical_loss": 3.318985969166602, "tokens_seen": 3020947456 }, { "epoch": 0.1, "learning_rate": 0.00045519674935842606, "loss": 2.7148, "theoretical_loss": 3.318980441098616, "tokens_seen": 3021012992 }, { "epoch": 0.1, "learning_rate": 0.0004550898203592814, "loss": 2.5545, "theoretical_loss": 3.3189749131841286, "tokens_seen": 3021078528 }, { "epoch": 0.1, "learning_rate": 0.0004549828913601369, "loss": 2.5361, "theoretical_loss": 3.3189693854231326, "tokens_seen": 3021144064 }, { "epoch": 0.1, "learning_rate": 0.0004548759623609923, "loss": 2.7275, "theoretical_loss": 3.3189638578156195, "tokens_seen": 3021209600 }, { "epoch": 0.1, "learning_rate": 0.00045476903336184773, "loss": 2.7548, "theoretical_loss": 3.3189583303615824, "tokens_seen": 3021275136 }, { "epoch": 0.1, "learning_rate": 0.00045466210436270315, "loss": 2.6902, "theoretical_loss": 3.3189528030610136, "tokens_seen": 3021340672 }, { "epoch": 0.1, "learning_rate": 0.0004545551753635586, "loss": 2.3308, "theoretical_loss": 3.318947275913906, "tokens_seen": 3021406208 }, { "epoch": 0.1, "learning_rate": 0.00045444824636441404, "loss": 2.437, "theoretical_loss": 3.318941748920251, "tokens_seen": 3021471744 }, { "epoch": 0.1, "objective/train/docs_used": 1657764, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.8103880882263184, "objective/train/theoretical_loss": 3.318936222080042, "objective/train/tokens_used": 51588576, "theoretical_loss": 3.318936222080042, "tokens_seen": 3021537280 }, { "epoch": 0.1, "learning_rate": 0.00045434131736526946, "loss": 2.5186, "theoretical_loss": 3.318936222080042, "tokens_seen": 3021537280 }, { "epoch": 0.1, "learning_rate": 0.00045423438836612493, "loss": 2.619, "theoretical_loss": 3.318930695393271, "tokens_seen": 3021602816 }, { "epoch": 0.1, "learning_rate": 0.00045412745936698035, "loss": 2.6483, "theoretical_loss": 3.3189251688599297, "tokens_seen": 3021668352 }, { "epoch": 0.1, "learning_rate": 0.00045402053036783577, "loss": 2.446, "theoretical_loss": 3.3189196424800116, "tokens_seen": 3021733888 }, { "epoch": 0.1, "learning_rate": 0.0004539136013686912, "loss": 2.2355, "theoretical_loss": 3.318914116253509, "tokens_seen": 3021799424 }, { "epoch": 0.1, "learning_rate": 0.00045380667236954666, "loss": 2.4651, "theoretical_loss": 3.3189085901804134, "tokens_seen": 3021864960 }, { "epoch": 0.1, "learning_rate": 0.000453699743370402, "loss": 2.3771, "theoretical_loss": 3.3189030642607182, "tokens_seen": 3021930496 }, { "epoch": 0.1, "learning_rate": 0.0004535928143712575, "loss": 2.2859, "theoretical_loss": 3.3188975384944155, "tokens_seen": 3021996032 }, { "epoch": 0.1, "learning_rate": 0.0004534858853721129, "loss": 2.4815, "theoretical_loss": 3.3188920128814976, "tokens_seen": 3022061568 }, { "epoch": 0.1, "learning_rate": 0.00045337895637296834, "loss": 2.4833, "theoretical_loss": 3.318886487421957, "tokens_seen": 3022127104 }, { "epoch": 0.1, "learning_rate": 0.00045327202737382375, "loss": 2.6793, "theoretical_loss": 3.3188809621157858, "tokens_seen": 3022192640 }, { "epoch": 0.1, "learning_rate": 0.00045316509837467923, "loss": 2.4563, "theoretical_loss": 3.318875436962977, "tokens_seen": 3022258176 }, { "epoch": 0.1, "learning_rate": 0.0004530581693755347, "loss": 2.5047, "theoretical_loss": 3.3188699119635228, "tokens_seen": 3022323712 }, { "epoch": 0.1, "learning_rate": 0.00045295124037639006, "loss": 2.7077, "theoretical_loss": 3.3188643871174155, "tokens_seen": 3022389248 }, { "epoch": 0.1, "learning_rate": 0.00045284431137724554, "loss": 2.5694, "theoretical_loss": 3.3188588624246473, "tokens_seen": 3022454784 }, { "epoch": 0.1, "learning_rate": 0.00045273738237810096, "loss": 2.4641, "theoretical_loss": 3.318853337885211, "tokens_seen": 3022520320 }, { "epoch": 0.1, "learning_rate": 0.0004526304533789564, "loss": 2.5008, "theoretical_loss": 3.318847813499099, "tokens_seen": 3022585856 }, { "epoch": 0.1, "learning_rate": 0.0004525235243798118, "loss": 2.5933, "theoretical_loss": 3.318842289266304, "tokens_seen": 3022651392 }, { "epoch": 0.1, "learning_rate": 0.00045241659538066727, "loss": 2.7056, "theoretical_loss": 3.318836765186817, "tokens_seen": 3022716928 }, { "epoch": 0.1, "learning_rate": 0.00045230966638152263, "loss": 2.5471, "theoretical_loss": 3.3188312412606327, "tokens_seen": 3022782464 }, { "epoch": 0.1, "learning_rate": 0.0004522027373823781, "loss": 2.6295, "theoretical_loss": 3.3188257174877416, "tokens_seen": 3022848000 }, { "epoch": 0.1, "learning_rate": 0.0004520958083832336, "loss": 2.5878, "theoretical_loss": 3.3188201938681368, "tokens_seen": 3022913536 }, { "epoch": 0.11, "learning_rate": 0.00045198887938408894, "loss": 2.7199, "theoretical_loss": 3.3188146704018107, "tokens_seen": 3022979072 }, { "epoch": 0.11, "learning_rate": 0.0004518819503849444, "loss": 2.441, "theoretical_loss": 3.318809147088756, "tokens_seen": 3023044608 }, { "epoch": 0.11, "learning_rate": 0.00045177502138579983, "loss": 2.5828, "theoretical_loss": 3.318803623928965, "tokens_seen": 3023110144 }, { "epoch": 0.11, "objective/train/docs_used": 1658338, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.4892239570617676, "objective/train/theoretical_loss": 3.3187981009224297, "objective/train/tokens_used": 53226976, "theoretical_loss": 3.3187981009224297, "tokens_seen": 3023175680 }, { "epoch": 0.11, "learning_rate": 0.0004516680923866553, "loss": 2.4785, "theoretical_loss": 3.3187981009224297, "tokens_seen": 3023175680 }, { "epoch": 0.11, "learning_rate": 0.00045156116338751067, "loss": 2.4167, "theoretical_loss": 3.318792578069143, "tokens_seen": 3023241216 }, { "epoch": 0.11, "learning_rate": 0.00045145423438836614, "loss": 2.4702, "theoretical_loss": 3.3187870553690972, "tokens_seen": 3023306752 }, { "epoch": 0.11, "learning_rate": 0.00045134730538922156, "loss": 2.3209, "theoretical_loss": 3.3187815328222845, "tokens_seen": 3023372288 }, { "epoch": 0.11, "learning_rate": 0.000451240376390077, "loss": 2.5124, "theoretical_loss": 3.3187760104286976, "tokens_seen": 3023437824 }, { "epoch": 0.11, "learning_rate": 0.00045113344739093245, "loss": 2.5166, "theoretical_loss": 3.3187704881883286, "tokens_seen": 3023503360 }, { "epoch": 0.11, "learning_rate": 0.00045102651839178787, "loss": 2.6266, "theoretical_loss": 3.3187649661011704, "tokens_seen": 3023568896 }, { "epoch": 0.11, "learning_rate": 0.0004509195893926433, "loss": 2.6106, "theoretical_loss": 3.3187594441672155, "tokens_seen": 3023634432 }, { "epoch": 0.11, "learning_rate": 0.0004508126603934987, "loss": 2.768, "theoretical_loss": 3.3187539223864557, "tokens_seen": 3023699968 }, { "epoch": 0.11, "learning_rate": 0.0004507057313943542, "loss": 2.5288, "theoretical_loss": 3.3187484007588837, "tokens_seen": 3023765504 }, { "epoch": 0.11, "learning_rate": 0.0004505988023952096, "loss": 2.4277, "theoretical_loss": 3.318742879284492, "tokens_seen": 3023831040 }, { "epoch": 0.11, "learning_rate": 0.000450491873396065, "loss": 2.7191, "theoretical_loss": 3.3187373579632733, "tokens_seen": 3023896576 }, { "epoch": 0.11, "learning_rate": 0.00045038494439692044, "loss": 2.5977, "theoretical_loss": 3.3187318367952194, "tokens_seen": 3023962112 }, { "epoch": 0.11, "learning_rate": 0.0004502780153977759, "loss": 2.5787, "theoretical_loss": 3.3187263157803235, "tokens_seen": 3024027648 }, { "epoch": 0.11, "learning_rate": 0.00045017108639863133, "loss": 2.7877, "theoretical_loss": 3.318720794918577, "tokens_seen": 3024093184 }, { "epoch": 0.11, "learning_rate": 0.00045006415739948675, "loss": 2.6564, "theoretical_loss": 3.3187152742099735, "tokens_seen": 3024158720 }, { "epoch": 0.11, "learning_rate": 0.0004499572284003422, "loss": 2.7045, "theoretical_loss": 3.3187097536545047, "tokens_seen": 3024224256 }, { "epoch": 0.11, "learning_rate": 0.0004498502994011976, "loss": 2.6133, "theoretical_loss": 3.3187042332521632, "tokens_seen": 3024289792 }, { "epoch": 0.11, "learning_rate": 0.00044974337040205306, "loss": 2.4414, "theoretical_loss": 3.3186987130029415, "tokens_seen": 3024355328 }, { "epoch": 0.11, "learning_rate": 0.0004496364414029085, "loss": 2.6137, "theoretical_loss": 3.318693192906832, "tokens_seen": 3024420864 }, { "epoch": 0.11, "learning_rate": 0.0004495295124037639, "loss": 2.5737, "theoretical_loss": 3.3186876729638266, "tokens_seen": 3024486400 }, { "epoch": 0.11, "learning_rate": 0.0004494225834046193, "loss": 2.6552, "theoretical_loss": 3.318682153173919, "tokens_seen": 3024551936 }, { "epoch": 0.11, "learning_rate": 0.0004493156544054748, "loss": 2.5312, "theoretical_loss": 3.3186766335371005, "tokens_seen": 3024617472 }, { "epoch": 0.11, "learning_rate": 0.00044920872540633026, "loss": 2.5598, "theoretical_loss": 3.3186711140533642, "tokens_seen": 3024683008 }, { "epoch": 0.11, "learning_rate": 0.0004491017964071856, "loss": 2.6287, "theoretical_loss": 3.318665594722702, "tokens_seen": 3024748544 }, { "epoch": 0.11, "objective/train/docs_used": 1659421, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.7263076305389404, "objective/train/theoretical_loss": 3.3186600755451066, "objective/train/tokens_used": 54865376, "theoretical_loss": 3.3186600755451066, "tokens_seen": 3024814080 }, { "epoch": 0.11, "learning_rate": 0.0004489948674080411, "loss": 2.5863, "theoretical_loss": 3.3186600755451066, "tokens_seen": 3024814080 }, { "epoch": 0.11, "learning_rate": 0.0004488879384088965, "loss": 2.4847, "theoretical_loss": 3.3186545565205705, "tokens_seen": 3024879616 }, { "epoch": 0.11, "learning_rate": 0.00044878100940975193, "loss": 2.6246, "theoretical_loss": 3.3186490376490863, "tokens_seen": 3024945152 }, { "epoch": 0.11, "learning_rate": 0.00044867408041060735, "loss": 2.4603, "theoretical_loss": 3.318643518930646, "tokens_seen": 3025010688 }, { "epoch": 0.11, "learning_rate": 0.0004485671514114628, "loss": 2.5178, "theoretical_loss": 3.318638000365242, "tokens_seen": 3025076224 }, { "epoch": 0.11, "learning_rate": 0.0004484602224123182, "loss": 2.5561, "theoretical_loss": 3.3186324819528674, "tokens_seen": 3025141760 }, { "epoch": 0.11, "learning_rate": 0.00044835329341317366, "loss": 2.5152, "theoretical_loss": 3.318626963693514, "tokens_seen": 3025207296 }, { "epoch": 0.11, "learning_rate": 0.0004482463644140291, "loss": 2.4108, "theoretical_loss": 3.318621445587175, "tokens_seen": 3025272832 }, { "epoch": 0.11, "learning_rate": 0.0004481394354148845, "loss": 2.7335, "theoretical_loss": 3.3186159276338416, "tokens_seen": 3025338368 }, { "epoch": 0.11, "learning_rate": 0.00044803250641573997, "loss": 2.4303, "theoretical_loss": 3.3186104098335076, "tokens_seen": 3025403904 }, { "epoch": 0.11, "learning_rate": 0.0004479255774165954, "loss": 2.3138, "theoretical_loss": 3.3186048921861646, "tokens_seen": 3025469440 }, { "epoch": 0.11, "learning_rate": 0.00044781864841745086, "loss": 2.4013, "theoretical_loss": 3.318599374691805, "tokens_seen": 3025534976 }, { "epoch": 0.11, "learning_rate": 0.00044771171941830623, "loss": 2.3985, "theoretical_loss": 3.3185938573504217, "tokens_seen": 3025600512 }, { "epoch": 0.11, "learning_rate": 0.0004476047904191617, "loss": 2.6061, "theoretical_loss": 3.318588340162007, "tokens_seen": 3025666048 }, { "epoch": 0.11, "learning_rate": 0.0004474978614200171, "loss": 2.659, "theoretical_loss": 3.3185828231265533, "tokens_seen": 3025731584 }, { "epoch": 0.11, "learning_rate": 0.00044739093242087254, "loss": 2.2818, "theoretical_loss": 3.3185773062440527, "tokens_seen": 3025797120 }, { "epoch": 0.11, "learning_rate": 0.00044728400342172796, "loss": 2.4729, "theoretical_loss": 3.3185717895144986, "tokens_seen": 3025862656 }, { "epoch": 0.11, "learning_rate": 0.00044717707442258343, "loss": 2.6619, "theoretical_loss": 3.3185662729378826, "tokens_seen": 3025928192 }, { "epoch": 0.11, "learning_rate": 0.00044707014542343885, "loss": 2.5107, "theoretical_loss": 3.3185607565141972, "tokens_seen": 3025993728 }, { "epoch": 0.12, "learning_rate": 0.00044696321642429427, "loss": 2.3266, "theoretical_loss": 3.318555240243435, "tokens_seen": 3026059264 }, { "epoch": 0.12, "learning_rate": 0.00044685628742514974, "loss": 2.6211, "theoretical_loss": 3.318549724125589, "tokens_seen": 3026124800 }, { "epoch": 0.12, "learning_rate": 0.0004467493584260051, "loss": 2.3811, "theoretical_loss": 3.3185442081606507, "tokens_seen": 3026190336 }, { "epoch": 0.12, "learning_rate": 0.0004466424294268606, "loss": 2.5501, "theoretical_loss": 3.318538692348613, "tokens_seen": 3026255872 }, { "epoch": 0.12, "learning_rate": 0.000446535500427716, "loss": 2.7668, "theoretical_loss": 3.3185331766894683, "tokens_seen": 3026321408 }, { "epoch": 0.12, "learning_rate": 0.00044642857142857147, "loss": 2.6323, "theoretical_loss": 3.318527661183209, "tokens_seen": 3026386944 }, { "epoch": 0.12, "objective/train/docs_used": 1660021, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 3.0193262100219727, "objective/train/theoretical_loss": 3.318522145829828, "objective/train/tokens_used": 56503776, "theoretical_loss": 3.318522145829828, "tokens_seen": 3026452480 }, { "epoch": 0.12, "learning_rate": 0.00044632164242942683, "loss": 2.8896, "theoretical_loss": 3.318522145829828, "tokens_seen": 3026452480 }, { "epoch": 0.12, "learning_rate": 0.0004462147134302823, "loss": 2.4917, "theoretical_loss": 3.318516630629317, "tokens_seen": 3026518016 }, { "epoch": 0.12, "learning_rate": 0.0004461077844311378, "loss": 2.3977, "theoretical_loss": 3.318511115581669, "tokens_seen": 3026583552 }, { "epoch": 0.12, "learning_rate": 0.00044600085543199314, "loss": 2.5538, "theoretical_loss": 3.3185056006868763, "tokens_seen": 3026649088 }, { "epoch": 0.12, "learning_rate": 0.0004458939264328486, "loss": 2.41, "theoretical_loss": 3.3185000859449314, "tokens_seen": 3026714624 }, { "epoch": 0.12, "learning_rate": 0.00044578699743370403, "loss": 2.4, "theoretical_loss": 3.318494571355827, "tokens_seen": 3026780160 }, { "epoch": 0.12, "learning_rate": 0.00044568006843455945, "loss": 2.7303, "theoretical_loss": 3.3184890569195544, "tokens_seen": 3026845696 }, { "epoch": 0.12, "learning_rate": 0.00044557313943541487, "loss": 2.4075, "theoretical_loss": 3.3184835426361077, "tokens_seen": 3026911232 }, { "epoch": 0.12, "learning_rate": 0.00044546621043627034, "loss": 2.4872, "theoretical_loss": 3.3184780285054782, "tokens_seen": 3026976768 }, { "epoch": 0.12, "learning_rate": 0.00044535928143712576, "loss": 2.4217, "theoretical_loss": 3.318472514527659, "tokens_seen": 3027042304 }, { "epoch": 0.12, "learning_rate": 0.0004452523524379812, "loss": 2.6315, "theoretical_loss": 3.318467000702642, "tokens_seen": 3027107840 }, { "epoch": 0.12, "learning_rate": 0.0004451454234388366, "loss": 2.553, "theoretical_loss": 3.31846148703042, "tokens_seen": 3027173376 }, { "epoch": 0.12, "learning_rate": 0.0004450384944396921, "loss": 2.3969, "theoretical_loss": 3.3184559735109853, "tokens_seen": 3027238912 }, { "epoch": 0.12, "learning_rate": 0.0004449315654405475, "loss": 2.4959, "theoretical_loss": 3.3184504601443305, "tokens_seen": 3027304448 }, { "epoch": 0.12, "learning_rate": 0.0004448246364414029, "loss": 2.7329, "theoretical_loss": 3.3184449469304482, "tokens_seen": 3027369984 }, { "epoch": 0.12, "learning_rate": 0.0004447177074422584, "loss": 2.6387, "theoretical_loss": 3.318439433869331, "tokens_seen": 3027435520 }, { "epoch": 0.12, "learning_rate": 0.00044461077844311375, "loss": 2.5593, "theoretical_loss": 3.3184339209609703, "tokens_seen": 3027501056 }, { "epoch": 0.12, "learning_rate": 0.0004445038494439692, "loss": 2.5025, "theoretical_loss": 3.3184284082053597, "tokens_seen": 3027566592 }, { "epoch": 0.12, "learning_rate": 0.00044439692044482464, "loss": 2.8454, "theoretical_loss": 3.318422895602491, "tokens_seen": 3027632128 }, { "epoch": 0.12, "learning_rate": 0.00044428999144568006, "loss": 2.5777, "theoretical_loss": 3.318417383152357, "tokens_seen": 3027697664 }, { "epoch": 0.12, "learning_rate": 0.0004441830624465355, "loss": 2.5114, "theoretical_loss": 3.31841187085495, "tokens_seen": 3027763200 }, { "epoch": 0.12, "learning_rate": 0.00044407613344739095, "loss": 2.4808, "theoretical_loss": 3.3184063587102632, "tokens_seen": 3027828736 }, { "epoch": 0.12, "learning_rate": 0.0004439692044482464, "loss": 2.6478, "theoretical_loss": 3.318400846718288, "tokens_seen": 3027894272 }, { "epoch": 0.12, "learning_rate": 0.0004438622754491018, "loss": 2.3825, "theoretical_loss": 3.318395334879017, "tokens_seen": 3027959808 }, { "epoch": 0.12, "learning_rate": 0.00044375534644995726, "loss": 2.4853, "theoretical_loss": 3.318389823192443, "tokens_seen": 3028025344 }, { "epoch": 0.12, "objective/train/docs_used": 1661280, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.6410443782806396, "objective/train/theoretical_loss": 3.3183843116585585, "objective/train/tokens_used": 58142176, "theoretical_loss": 3.3183843116585585, "tokens_seen": 3028090880 }, { "epoch": 0.12, "learning_rate": 0.0004436484174508127, "loss": 2.469, "theoretical_loss": 3.3183843116585585, "tokens_seen": 3028090880 }, { "epoch": 0.12, "learning_rate": 0.0004435414884516681, "loss": 2.7035, "theoretical_loss": 3.318378800277356, "tokens_seen": 3028156416 }, { "epoch": 0.12, "learning_rate": 0.0004434345594525235, "loss": 2.5603, "theoretical_loss": 3.318373289048828, "tokens_seen": 3028221952 }, { "epoch": 0.12, "learning_rate": 0.000443327630453379, "loss": 2.5599, "theoretical_loss": 3.3183677779729663, "tokens_seen": 3028287488 }, { "epoch": 0.12, "learning_rate": 0.00044322070145423435, "loss": 2.5114, "theoretical_loss": 3.318362267049764, "tokens_seen": 3028353024 }, { "epoch": 0.12, "learning_rate": 0.0004431137724550898, "loss": 2.6164, "theoretical_loss": 3.3183567562792136, "tokens_seen": 3028418560 }, { "epoch": 0.12, "learning_rate": 0.0004430068434559453, "loss": 2.6386, "theoretical_loss": 3.318351245661307, "tokens_seen": 3028484096 }, { "epoch": 0.12, "learning_rate": 0.00044289991445680066, "loss": 2.6347, "theoretical_loss": 3.3183457351960377, "tokens_seen": 3028549632 }, { "epoch": 0.12, "learning_rate": 0.00044279298545765614, "loss": 2.6369, "theoretical_loss": 3.3183402248833973, "tokens_seen": 3028615168 }, { "epoch": 0.12, "learning_rate": 0.00044268605645851155, "loss": 2.6041, "theoretical_loss": 3.3183347147233784, "tokens_seen": 3028680704 }, { "epoch": 0.12, "learning_rate": 0.00044257912745936703, "loss": 2.4263, "theoretical_loss": 3.318329204715974, "tokens_seen": 3028746240 }, { "epoch": 0.12, "learning_rate": 0.0004424721984602224, "loss": 2.5039, "theoretical_loss": 3.3183236948611756, "tokens_seen": 3028811776 }, { "epoch": 0.12, "learning_rate": 0.00044236526946107786, "loss": 2.6112, "theoretical_loss": 3.3183181851589763, "tokens_seen": 3028877312 }, { "epoch": 0.12, "learning_rate": 0.0004422583404619333, "loss": 2.512, "theoretical_loss": 3.3183126756093686, "tokens_seen": 3028942848 }, { "epoch": 0.12, "learning_rate": 0.0004421514114627887, "loss": 2.4492, "theoretical_loss": 3.318307166212345, "tokens_seen": 3029008384 }, { "epoch": 0.12, "learning_rate": 0.0004420444824636442, "loss": 2.7508, "theoretical_loss": 3.318301656967898, "tokens_seen": 3029073920 }, { "epoch": 0.13, "learning_rate": 0.0004419375534644996, "loss": 2.6316, "theoretical_loss": 3.318296147876019, "tokens_seen": 3029139456 }, { "epoch": 0.13, "learning_rate": 0.000441830624465355, "loss": 2.6729, "theoretical_loss": 3.3182906389367024, "tokens_seen": 3029204992 }, { "epoch": 0.13, "learning_rate": 0.00044172369546621043, "loss": 2.7944, "theoretical_loss": 3.318285130149939, "tokens_seen": 3029270528 }, { "epoch": 0.13, "learning_rate": 0.0004416167664670659, "loss": 2.557, "theoretical_loss": 3.3182796215157224, "tokens_seen": 3029336064 }, { "epoch": 0.13, "learning_rate": 0.00044150983746792127, "loss": 2.5116, "theoretical_loss": 3.3182741130340445, "tokens_seen": 3029401600 }, { "epoch": 0.13, "learning_rate": 0.00044140290846877674, "loss": 2.3178, "theoretical_loss": 3.318268604704898, "tokens_seen": 3029467136 }, { "epoch": 0.13, "learning_rate": 0.00044129597946963216, "loss": 2.4742, "theoretical_loss": 3.318263096528275, "tokens_seen": 3029532672 }, { "epoch": 0.13, "learning_rate": 0.00044118905047048763, "loss": 2.4291, "theoretical_loss": 3.318257588504168, "tokens_seen": 3029598208 }, { "epoch": 0.13, "learning_rate": 0.000441082121471343, "loss": 2.3404, "theoretical_loss": 3.31825208063257, "tokens_seen": 3029663744 }, { "epoch": 0.13, "objective/train/docs_used": 1661975, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.466115951538086, "objective/train/theoretical_loss": 3.318246572913474, "objective/train/tokens_used": 59780576, "theoretical_loss": 3.318246572913474, "tokens_seen": 3029729280 }, { "epoch": 0.13, "learning_rate": 0.00044097519247219847, "loss": 2.4917, "theoretical_loss": 3.318246572913474, "tokens_seen": 3029729280 }, { "epoch": 0.13, "learning_rate": 0.00044086826347305394, "loss": 2.4753, "theoretical_loss": 3.3182410653468706, "tokens_seen": 3029794816 }, { "epoch": 0.13, "learning_rate": 0.0004407613344739093, "loss": 2.6818, "theoretical_loss": 3.318235557932754, "tokens_seen": 3029860352 }, { "epoch": 0.13, "learning_rate": 0.0004406544054747648, "loss": 2.5633, "theoretical_loss": 3.3182300506711155, "tokens_seen": 3029925888 }, { "epoch": 0.13, "learning_rate": 0.0004405474764756202, "loss": 2.3311, "theoretical_loss": 3.318224543561948, "tokens_seen": 3029991424 }, { "epoch": 0.13, "learning_rate": 0.0004404405474764756, "loss": 2.323, "theoretical_loss": 3.318219036605245, "tokens_seen": 3030056960 }, { "epoch": 0.13, "learning_rate": 0.00044033361847733104, "loss": 2.4226, "theoretical_loss": 3.3182135298009974, "tokens_seen": 3030122496 }, { "epoch": 0.13, "learning_rate": 0.0004402266894781865, "loss": 2.2801, "theoretical_loss": 3.3182080231491984, "tokens_seen": 3030188032 }, { "epoch": 0.13, "learning_rate": 0.00044011976047904193, "loss": 2.8268, "theoretical_loss": 3.3182025166498406, "tokens_seen": 3030253568 }, { "epoch": 0.13, "learning_rate": 0.00044001283147989735, "loss": 2.6459, "theoretical_loss": 3.3181970103029164, "tokens_seen": 3030319104 }, { "epoch": 0.13, "learning_rate": 0.0004399059024807528, "loss": 2.4968, "theoretical_loss": 3.3181915041084182, "tokens_seen": 3030384640 }, { "epoch": 0.13, "learning_rate": 0.00043979897348160824, "loss": 2.5702, "theoretical_loss": 3.3181859980663386, "tokens_seen": 3030450176 }, { "epoch": 0.13, "learning_rate": 0.00043969204448246366, "loss": 2.4552, "theoretical_loss": 3.3181804921766695, "tokens_seen": 3030515712 }, { "epoch": 0.13, "learning_rate": 0.0004395851154833191, "loss": 2.3615, "theoretical_loss": 3.3181749864394043, "tokens_seen": 3030581248 }, { "epoch": 0.13, "learning_rate": 0.00043947818648417455, "loss": 2.5737, "theoretical_loss": 3.318169480854535, "tokens_seen": 3030646784 }, { "epoch": 0.13, "learning_rate": 0.0004393712574850299, "loss": 2.6029, "theoretical_loss": 3.318163975422054, "tokens_seen": 3030712320 }, { "epoch": 0.13, "learning_rate": 0.0004392643284858854, "loss": 2.4033, "theoretical_loss": 3.318158470141954, "tokens_seen": 3030777856 }, { "epoch": 0.13, "learning_rate": 0.0004391573994867408, "loss": 2.5225, "theoretical_loss": 3.318152965014227, "tokens_seen": 3030843392 }, { "epoch": 0.13, "learning_rate": 0.0004390504704875962, "loss": 2.4062, "theoretical_loss": 3.3181474600388667, "tokens_seen": 3030908928 }, { "epoch": 0.13, "learning_rate": 0.0004389435414884517, "loss": 2.5334, "theoretical_loss": 3.3181419552158644, "tokens_seen": 3030974464 }, { "epoch": 0.13, "learning_rate": 0.0004388366124893071, "loss": 2.2213, "theoretical_loss": 3.318136450545213, "tokens_seen": 3031040000 }, { "epoch": 0.13, "learning_rate": 0.0004387296834901626, "loss": 2.5278, "theoretical_loss": 3.318130946026905, "tokens_seen": 3031105536 }, { "epoch": 0.13, "learning_rate": 0.00043862275449101795, "loss": 2.5084, "theoretical_loss": 3.318125441660933, "tokens_seen": 3031171072 }, { "epoch": 0.13, "learning_rate": 0.0004385158254918734, "loss": 2.6303, "theoretical_loss": 3.318119937447289, "tokens_seen": 3031236608 }, { "epoch": 0.13, "learning_rate": 0.00043840889649272884, "loss": 2.5961, "theoretical_loss": 3.318114433385966, "tokens_seen": 3031302144 }, { "epoch": 0.13, "objective/train/docs_used": 1662642, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.3627991676330566, "objective/train/theoretical_loss": 3.3181089294769563, "objective/train/tokens_used": 61418976, "theoretical_loss": 3.3181089294769563, "tokens_seen": 3031367680 }, { "epoch": 0.13, "learning_rate": 0.00043830196749358426, "loss": 2.6984, "theoretical_loss": 3.3181089294769563, "tokens_seen": 3031367680 }, { "epoch": 0.13, "learning_rate": 0.0004381950384944397, "loss": 2.7248, "theoretical_loss": 3.3181034257202526, "tokens_seen": 3031433216 }, { "epoch": 0.13, "learning_rate": 0.00043808810949529515, "loss": 2.6676, "theoretical_loss": 3.318097922115847, "tokens_seen": 3031498752 }, { "epoch": 0.13, "learning_rate": 0.00043798118049615057, "loss": 2.4558, "theoretical_loss": 3.318092418663732, "tokens_seen": 3031564288 }, { "epoch": 0.13, "learning_rate": 0.000437874251497006, "loss": 2.3336, "theoretical_loss": 3.318086915363901, "tokens_seen": 3031629824 }, { "epoch": 0.13, "learning_rate": 0.00043776732249786146, "loss": 2.5785, "theoretical_loss": 3.3180814122163453, "tokens_seen": 3031695360 }, { "epoch": 0.13, "learning_rate": 0.0004376603934987168, "loss": 2.3364, "theoretical_loss": 3.318075909221058, "tokens_seen": 3031760896 }, { "epoch": 0.13, "learning_rate": 0.0004375534644995723, "loss": 2.587, "theoretical_loss": 3.3180704063780313, "tokens_seen": 3031826432 }, { "epoch": 0.13, "learning_rate": 0.0004374465355004277, "loss": 2.4558, "theoretical_loss": 3.3180649036872585, "tokens_seen": 3031891968 }, { "epoch": 0.13, "learning_rate": 0.0004373396065012832, "loss": 2.3549, "theoretical_loss": 3.318059401148731, "tokens_seen": 3031957504 }, { "epoch": 0.13, "learning_rate": 0.00043723267750213856, "loss": 2.5263, "theoretical_loss": 3.3180538987624417, "tokens_seen": 3032023040 }, { "epoch": 0.13, "learning_rate": 0.00043712574850299403, "loss": 2.5736, "theoretical_loss": 3.3180483965283836, "tokens_seen": 3032088576 }, { "epoch": 0.13, "learning_rate": 0.00043701881950384945, "loss": 2.7003, "theoretical_loss": 3.3180428944465485, "tokens_seen": 3032154112 }, { "epoch": 0.14, "learning_rate": 0.00043691189050470487, "loss": 2.654, "theoretical_loss": 3.318037392516929, "tokens_seen": 3032219648 }, { "epoch": 0.14, "learning_rate": 0.00043680496150556034, "loss": 2.6114, "theoretical_loss": 3.318031890739518, "tokens_seen": 3032285184 }, { "epoch": 0.14, "learning_rate": 0.00043669803250641576, "loss": 2.5595, "theoretical_loss": 3.318026389114308, "tokens_seen": 3032350720 }, { "epoch": 0.14, "learning_rate": 0.0004365911035072712, "loss": 2.7492, "theoretical_loss": 3.318020887641291, "tokens_seen": 3032416256 }, { "epoch": 0.14, "learning_rate": 0.0004364841745081266, "loss": 2.5468, "theoretical_loss": 3.3180153863204596, "tokens_seen": 3032481792 }, { "epoch": 0.14, "learning_rate": 0.00043637724550898207, "loss": 2.7061, "theoretical_loss": 3.318009885151807, "tokens_seen": 3032547328 }, { "epoch": 0.14, "learning_rate": 0.00043627031650983743, "loss": 2.5121, "theoretical_loss": 3.3180043841353246, "tokens_seen": 3032612864 }, { "epoch": 0.14, "learning_rate": 0.0004361633875106929, "loss": 2.5887, "theoretical_loss": 3.317998883271006, "tokens_seen": 3032678400 }, { "epoch": 0.14, "learning_rate": 0.0004360564585115483, "loss": 2.5448, "theoretical_loss": 3.317993382558843, "tokens_seen": 3032743936 }, { "epoch": 0.14, "learning_rate": 0.0004359495295124038, "loss": 2.506, "theoretical_loss": 3.317987881998828, "tokens_seen": 3032809472 }, { "epoch": 0.14, "learning_rate": 0.0004358426005132592, "loss": 2.5888, "theoretical_loss": 3.317982381590954, "tokens_seen": 3032875008 }, { "epoch": 0.14, "learning_rate": 0.00043573567151411463, "loss": 2.6217, "theoretical_loss": 3.3179768813352135, "tokens_seen": 3032940544 }, { "epoch": 0.14, "objective/train/docs_used": 1663221, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.339816093444824, "objective/train/theoretical_loss": 3.3179713812315983, "objective/train/tokens_used": 63057376, "theoretical_loss": 3.3179713812315983, "tokens_seen": 3033006080 }, { "epoch": 0.14, "learning_rate": 0.0004356287425149701, "loss": 2.6192, "theoretical_loss": 3.3179713812315983, "tokens_seen": 3033006080 }, { "epoch": 0.14, "learning_rate": 0.00043552181351582547, "loss": 2.4596, "theoretical_loss": 3.317965881280102, "tokens_seen": 3033071616 }, { "epoch": 0.14, "learning_rate": 0.00043541488451668094, "loss": 2.3786, "theoretical_loss": 3.317960381480716, "tokens_seen": 3033137152 }, { "epoch": 0.14, "learning_rate": 0.00043530795551753636, "loss": 2.3844, "theoretical_loss": 3.3179548818334337, "tokens_seen": 3033202688 }, { "epoch": 0.14, "learning_rate": 0.0004352010265183918, "loss": 2.6781, "theoretical_loss": 3.317949382338247, "tokens_seen": 3033268224 }, { "epoch": 0.14, "learning_rate": 0.0004350940975192472, "loss": 2.6249, "theoretical_loss": 3.3179438829951486, "tokens_seen": 3033333760 }, { "epoch": 0.14, "learning_rate": 0.00043498716852010267, "loss": 2.7442, "theoretical_loss": 3.3179383838041314, "tokens_seen": 3033399296 }, { "epoch": 0.14, "learning_rate": 0.0004348802395209581, "loss": 2.5186, "theoretical_loss": 3.317932884765187, "tokens_seen": 3033464832 }, { "epoch": 0.14, "learning_rate": 0.0004347733105218135, "loss": 2.7132, "theoretical_loss": 3.317927385878309, "tokens_seen": 3033530368 }, { "epoch": 0.14, "learning_rate": 0.000434666381522669, "loss": 2.6596, "theoretical_loss": 3.317921887143489, "tokens_seen": 3033595904 }, { "epoch": 0.14, "learning_rate": 0.0004345594525235244, "loss": 2.5732, "theoretical_loss": 3.31791638856072, "tokens_seen": 3033661440 }, { "epoch": 0.14, "learning_rate": 0.0004344525235243798, "loss": 2.6813, "theoretical_loss": 3.3179108901299945, "tokens_seen": 3033726976 }, { "epoch": 0.14, "learning_rate": 0.00043434559452523524, "loss": 2.6586, "theoretical_loss": 3.317905391851305, "tokens_seen": 3033792512 }, { "epoch": 0.14, "learning_rate": 0.0004342386655260907, "loss": 2.4702, "theoretical_loss": 3.317899893724644, "tokens_seen": 3033858048 }, { "epoch": 0.14, "learning_rate": 0.0004341317365269461, "loss": 2.6286, "theoretical_loss": 3.3178943957500033, "tokens_seen": 3033923584 }, { "epoch": 0.14, "learning_rate": 0.00043402480752780155, "loss": 2.6536, "theoretical_loss": 3.3178888979273764, "tokens_seen": 3033989120 }, { "epoch": 0.14, "learning_rate": 0.000433917878528657, "loss": 2.4538, "theoretical_loss": 3.317883400256756, "tokens_seen": 3034054656 }, { "epoch": 0.14, "learning_rate": 0.0004338109495295124, "loss": 2.4945, "theoretical_loss": 3.3178779027381333, "tokens_seen": 3034120192 }, { "epoch": 0.14, "learning_rate": 0.00043370402053036786, "loss": 2.5167, "theoretical_loss": 3.3178724053715016, "tokens_seen": 3034185728 }, { "epoch": 0.14, "learning_rate": 0.0004335970915312233, "loss": 2.7934, "theoretical_loss": 3.3178669081568537, "tokens_seen": 3034251264 }, { "epoch": 0.14, "learning_rate": 0.00043349016253207875, "loss": 2.3483, "theoretical_loss": 3.3178614110941815, "tokens_seen": 3034316800 }, { "epoch": 0.14, "learning_rate": 0.0004333832335329341, "loss": 2.384, "theoretical_loss": 3.317855914183478, "tokens_seen": 3034382336 }, { "epoch": 0.14, "learning_rate": 0.0004332763045337896, "loss": 2.6092, "theoretical_loss": 3.3178504174247356, "tokens_seen": 3034447872 }, { "epoch": 0.14, "learning_rate": 0.000433169375534645, "loss": 2.8584, "theoretical_loss": 3.317844920817947, "tokens_seen": 3034513408 }, { "epoch": 0.14, "learning_rate": 0.0004330624465355004, "loss": 2.6498, "theoretical_loss": 3.317839424363104, "tokens_seen": 3034578944 }, { "epoch": 0.14, "objective/train/docs_used": 1664335, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.460336923599243, "objective/train/theoretical_loss": 3.3178339280602, "objective/train/tokens_used": 64695776, "theoretical_loss": 3.3178339280602, "tokens_seen": 3034644480 }, { "epoch": 0.14, "learning_rate": 0.00043295551753635584, "loss": 2.4474, "theoretical_loss": 3.3178339280602, "tokens_seen": 3034644480 }, { "epoch": 0.14, "learning_rate": 0.0004328485885372113, "loss": 2.5976, "theoretical_loss": 3.317828431909227, "tokens_seen": 3034710016 }, { "epoch": 0.14, "learning_rate": 0.00043274165953806673, "loss": 2.2452, "theoretical_loss": 3.3178229359101774, "tokens_seen": 3034775552 }, { "epoch": 0.14, "learning_rate": 0.00043263473053892215, "loss": 2.5553, "theoretical_loss": 3.3178174400630445, "tokens_seen": 3034841088 }, { "epoch": 0.14, "learning_rate": 0.0004325278015397776, "loss": 2.4949, "theoretical_loss": 3.31781194436782, "tokens_seen": 3034906624 }, { "epoch": 0.14, "learning_rate": 0.000432420872540633, "loss": 2.7255, "theoretical_loss": 3.3178064488244967, "tokens_seen": 3034972160 }, { "epoch": 0.14, "learning_rate": 0.00043231394354148846, "loss": 2.4908, "theoretical_loss": 3.317800953433067, "tokens_seen": 3035037696 }, { "epoch": 0.14, "learning_rate": 0.0004322070145423439, "loss": 2.7668, "theoretical_loss": 3.3177954581935234, "tokens_seen": 3035103232 }, { "epoch": 0.14, "learning_rate": 0.00043210008554319935, "loss": 2.6495, "theoretical_loss": 3.317789963105859, "tokens_seen": 3035168768 }, { "epoch": 0.14, "learning_rate": 0.0004319931565440547, "loss": 2.6374, "theoretical_loss": 3.317784468170066, "tokens_seen": 3035234304 }, { "epoch": 0.15, "learning_rate": 0.0004318862275449102, "loss": 2.5718, "theoretical_loss": 3.3177789733861363, "tokens_seen": 3035299840 }, { "epoch": 0.15, "learning_rate": 0.00043177929854576567, "loss": 2.6787, "theoretical_loss": 3.317773478754063, "tokens_seen": 3035365376 }, { "epoch": 0.15, "learning_rate": 0.00043167236954662103, "loss": 2.6101, "theoretical_loss": 3.3177679842738383, "tokens_seen": 3035430912 }, { "epoch": 0.15, "learning_rate": 0.0004315654405474765, "loss": 2.7885, "theoretical_loss": 3.317762489945456, "tokens_seen": 3035496448 }, { "epoch": 0.15, "learning_rate": 0.0004314585115483319, "loss": 2.4024, "theoretical_loss": 3.3177569957689066, "tokens_seen": 3035561984 }, { "epoch": 0.15, "learning_rate": 0.00043135158254918734, "loss": 2.5602, "theoretical_loss": 3.3177515017441843, "tokens_seen": 3035627520 }, { "epoch": 0.15, "learning_rate": 0.00043124465355004276, "loss": 2.7748, "theoretical_loss": 3.3177460078712806, "tokens_seen": 3035693056 }, { "epoch": 0.15, "learning_rate": 0.00043113772455089823, "loss": 2.5992, "theoretical_loss": 3.3177405141501883, "tokens_seen": 3035758592 }, { "epoch": 0.15, "learning_rate": 0.0004310307955517536, "loss": 2.6069, "theoretical_loss": 3.3177350205809004, "tokens_seen": 3035824128 }, { "epoch": 0.15, "learning_rate": 0.00043092386655260907, "loss": 2.5898, "theoretical_loss": 3.317729527163409, "tokens_seen": 3035889664 }, { "epoch": 0.15, "learning_rate": 0.00043081693755346454, "loss": 2.7153, "theoretical_loss": 3.317724033897706, "tokens_seen": 3035955200 }, { "epoch": 0.15, "learning_rate": 0.00043071000855431996, "loss": 2.6276, "theoretical_loss": 3.3177185407837855, "tokens_seen": 3036020736 }, { "epoch": 0.15, "learning_rate": 0.0004306030795551754, "loss": 2.5747, "theoretical_loss": 3.3177130478216386, "tokens_seen": 3036086272 }, { "epoch": 0.15, "learning_rate": 0.0004304961505560308, "loss": 2.921, "theoretical_loss": 3.3177075550112587, "tokens_seen": 3036151808 }, { "epoch": 0.15, "learning_rate": 0.00043038922155688627, "loss": 2.4716, "theoretical_loss": 3.3177020623526374, "tokens_seen": 3036217344 }, { "epoch": 0.15, "objective/train/docs_used": 1665364, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.702648878097534, "objective/train/theoretical_loss": 3.3176965698457686, "objective/train/tokens_used": 66334176, "theoretical_loss": 3.3176965698457686, "tokens_seen": 3036282880 }, { "epoch": 0.15, "learning_rate": 0.00043028229255774163, "loss": 2.8016, "theoretical_loss": 3.3176965698457686, "tokens_seen": 3036282880 }, { "epoch": 0.15, "learning_rate": 0.0004301753635585971, "loss": 2.6286, "theoretical_loss": 3.3176910774906436, "tokens_seen": 3036348416 }, { "epoch": 0.15, "learning_rate": 0.0004300684345594525, "loss": 2.5836, "theoretical_loss": 3.3176855852872555, "tokens_seen": 3036413952 }, { "epoch": 0.15, "learning_rate": 0.00042996150556030794, "loss": 2.7408, "theoretical_loss": 3.3176800932355968, "tokens_seen": 3036479488 }, { "epoch": 0.15, "learning_rate": 0.0004298545765611634, "loss": 2.5826, "theoretical_loss": 3.3176746013356597, "tokens_seen": 3036545024 }, { "epoch": 0.15, "learning_rate": 0.00042974764756201884, "loss": 2.258, "theoretical_loss": 3.3176691095874373, "tokens_seen": 3036610560 }, { "epoch": 0.15, "learning_rate": 0.00042964071856287425, "loss": 2.5244, "theoretical_loss": 3.317663617990922, "tokens_seen": 3036676096 }, { "epoch": 0.15, "learning_rate": 0.0004295337895637297, "loss": 2.4518, "theoretical_loss": 3.317658126546106, "tokens_seen": 3036741632 }, { "epoch": 0.15, "learning_rate": 0.00042942686056458515, "loss": 2.6188, "theoretical_loss": 3.3176526352529816, "tokens_seen": 3036807168 }, { "epoch": 0.15, "learning_rate": 0.00042931993156544056, "loss": 2.5812, "theoretical_loss": 3.3176471441115423, "tokens_seen": 3036872704 }, { "epoch": 0.15, "learning_rate": 0.000429213002566296, "loss": 2.5719, "theoretical_loss": 3.31764165312178, "tokens_seen": 3036938240 }, { "epoch": 0.15, "learning_rate": 0.0004291060735671514, "loss": 2.6047, "theoretical_loss": 3.317636162283687, "tokens_seen": 3037003776 }, { "epoch": 0.15, "learning_rate": 0.0004289991445680069, "loss": 2.4401, "theoretical_loss": 3.3176306715972563, "tokens_seen": 3037069312 }, { "epoch": 0.15, "learning_rate": 0.00042889221556886224, "loss": 2.5756, "theoretical_loss": 3.31762518106248, "tokens_seen": 3037134848 }, { "epoch": 0.15, "learning_rate": 0.0004287852865697177, "loss": 2.5878, "theoretical_loss": 3.3176196906793516, "tokens_seen": 3037200384 }, { "epoch": 0.15, "learning_rate": 0.0004286783575705732, "loss": 2.5203, "theoretical_loss": 3.317614200447862, "tokens_seen": 3037265920 }, { "epoch": 0.15, "learning_rate": 0.00042857142857142855, "loss": 2.4597, "theoretical_loss": 3.3176087103680056, "tokens_seen": 3037331456 }, { "epoch": 0.15, "learning_rate": 0.000428464499572284, "loss": 2.4774, "theoretical_loss": 3.3176032204397736, "tokens_seen": 3037396992 }, { "epoch": 0.15, "learning_rate": 0.00042835757057313944, "loss": 2.6599, "theoretical_loss": 3.3175977306631594, "tokens_seen": 3037462528 }, { "epoch": 0.15, "learning_rate": 0.0004282506415739949, "loss": 2.6216, "theoretical_loss": 3.3175922410381546, "tokens_seen": 3037528064 }, { "epoch": 0.15, "learning_rate": 0.0004281437125748503, "loss": 2.6732, "theoretical_loss": 3.3175867515647526, "tokens_seen": 3037593600 }, { "epoch": 0.15, "learning_rate": 0.00042803678357570575, "loss": 2.8955, "theoretical_loss": 3.3175812622429453, "tokens_seen": 3037659136 }, { "epoch": 0.15, "learning_rate": 0.00042792985457656117, "loss": 2.6119, "theoretical_loss": 3.317575773072726, "tokens_seen": 3037724672 }, { "epoch": 0.15, "learning_rate": 0.0004278229255774166, "loss": 2.4313, "theoretical_loss": 3.317570284054087, "tokens_seen": 3037790208 }, { "epoch": 0.15, "learning_rate": 0.00042771599657827206, "loss": 2.4777, "theoretical_loss": 3.3175647951870197, "tokens_seen": 3037855744 }, { "epoch": 0.15, "objective/train/docs_used": 1665791, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.440596342086792, "objective/train/theoretical_loss": 3.317559306471518, "objective/train/tokens_used": 67972576, "theoretical_loss": 3.317559306471518, "tokens_seen": 3037921280 }, { "epoch": 0.15, "learning_rate": 0.0004276090675791275, "loss": 2.4501, "theoretical_loss": 3.317559306471518, "tokens_seen": 3037921280 }, { "epoch": 0.15, "learning_rate": 0.0004275021385799829, "loss": 2.3919, "theoretical_loss": 3.3175538179075743, "tokens_seen": 3037986816 }, { "epoch": 0.15, "learning_rate": 0.0004273952095808383, "loss": 2.683, "theoretical_loss": 3.317548329495181, "tokens_seen": 3038052352 }, { "epoch": 0.15, "learning_rate": 0.0004272882805816938, "loss": 2.6477, "theoretical_loss": 3.31754284123433, "tokens_seen": 3038117888 }, { "epoch": 0.15, "learning_rate": 0.00042718135158254915, "loss": 2.5088, "theoretical_loss": 3.3175373531250147, "tokens_seen": 3038183424 }, { "epoch": 0.15, "learning_rate": 0.00042707442258340463, "loss": 2.6919, "theoretical_loss": 3.3175318651672274, "tokens_seen": 3038248960 }, { "epoch": 0.15, "learning_rate": 0.00042696749358426005, "loss": 2.3505, "theoretical_loss": 3.3175263773609602, "tokens_seen": 3038314496 }, { "epoch": 0.15, "learning_rate": 0.0004268605645851155, "loss": 2.4019, "theoretical_loss": 3.3175208897062065, "tokens_seen": 3038380032 }, { "epoch": 0.16, "learning_rate": 0.00042675363558597094, "loss": 2.5938, "theoretical_loss": 3.3175154022029583, "tokens_seen": 3038445568 }, { "epoch": 0.16, "learning_rate": 0.00042664670658682636, "loss": 2.7483, "theoretical_loss": 3.317509914851208, "tokens_seen": 3038511104 }, { "epoch": 0.16, "learning_rate": 0.00042653977758768183, "loss": 2.5649, "theoretical_loss": 3.317504427650948, "tokens_seen": 3038576640 }, { "epoch": 0.16, "learning_rate": 0.0004264328485885372, "loss": 2.7034, "theoretical_loss": 3.3174989406021718, "tokens_seen": 3038642176 }, { "epoch": 0.16, "learning_rate": 0.00042632591958939267, "loss": 2.5488, "theoretical_loss": 3.3174934537048713, "tokens_seen": 3038707712 }, { "epoch": 0.16, "learning_rate": 0.0004262189905902481, "loss": 2.5863, "theoretical_loss": 3.317487966959039, "tokens_seen": 3038773248 }, { "epoch": 0.16, "learning_rate": 0.0004261120615911035, "loss": 2.678, "theoretical_loss": 3.317482480364667, "tokens_seen": 3038838784 }, { "epoch": 0.16, "learning_rate": 0.0004260051325919589, "loss": 2.4831, "theoretical_loss": 3.3174769939217494, "tokens_seen": 3038904320 }, { "epoch": 0.16, "learning_rate": 0.0004258982035928144, "loss": 2.5811, "theoretical_loss": 3.3174715076302776, "tokens_seen": 3038969856 }, { "epoch": 0.16, "learning_rate": 0.00042579127459366976, "loss": 2.5055, "theoretical_loss": 3.317466021490244, "tokens_seen": 3039035392 }, { "epoch": 0.16, "learning_rate": 0.00042568434559452523, "loss": 2.7085, "theoretical_loss": 3.3174605355016418, "tokens_seen": 3039100928 }, { "epoch": 0.16, "learning_rate": 0.0004255774165953807, "loss": 2.2972, "theoretical_loss": 3.3174550496644626, "tokens_seen": 3039166464 }, { "epoch": 0.16, "learning_rate": 0.0004254704875962361, "loss": 2.7515, "theoretical_loss": 3.3174495639787005, "tokens_seen": 3039232000 }, { "epoch": 0.16, "learning_rate": 0.00042536355859709154, "loss": 2.449, "theoretical_loss": 3.317444078444346, "tokens_seen": 3039297536 }, { "epoch": 0.16, "learning_rate": 0.00042525662959794696, "loss": 2.7037, "theoretical_loss": 3.317438593061394, "tokens_seen": 3039363072 }, { "epoch": 0.16, "learning_rate": 0.00042514970059880243, "loss": 2.6908, "theoretical_loss": 3.317433107829835, "tokens_seen": 3039428608 }, { "epoch": 0.16, "learning_rate": 0.0004250427715996578, "loss": 2.461, "theoretical_loss": 3.3174276227496633, "tokens_seen": 3039494144 }, { "epoch": 0.16, "objective/train/docs_used": 1666356, "objective/train/instantaneous_batch_size": 16, "objective/train/instantaneous_microbatch_size": 16384, "objective/train/original_loss": 2.657942295074463, "objective/train/theoretical_loss": 3.31742213782087, "objective/train/tokens_used": 69610976, "theoretical_loss": 3.31742213782087, "tokens_seen": 3039559680 }, { "epoch": 0.16, "learning_rate": 0.00042493584260051327, "loss": 2.6056, "theoretical_loss": 3.31742213782087, "tokens_seen": 3039559680 }, { "epoch": 0.16, "learning_rate": 0.0004248289136013687, "loss": 2.5217, "theoretical_loss": 3.3174166530434483, "tokens_seen": 3039625216 }, { "epoch": 0.16, "learning_rate": 0.0004247219846022241, "loss": 2.6368, "theoretical_loss": 3.3174111684173906, "tokens_seen": 3039690752 }, { "epoch": 0.16, "learning_rate": 0.0004246150556030796, "loss": 2.4843, "theoretical_loss": 3.3174056839426895, "tokens_seen": 3039756288 }, { "epoch": 0.16, "learning_rate": 0.000424508126603935, "loss": 2.671, "theoretical_loss": 3.317400199619338, "tokens_seen": 3039821824 }, { "epoch": 0.16, "learning_rate": 0.0004244011976047904, "loss": 2.4733, "theoretical_loss": 3.3173947154473282, "tokens_seen": 3039887360 }, { "epoch": 0.16, "learning_rate": 0.00042429426860564584, "loss": 2.688, "theoretical_loss": 3.3173892314266524, "tokens_seen": 3039952896 }, { "epoch": 0.16, "learning_rate": 0.0004241873396065013, "loss": 2.6233, "theoretical_loss": 3.3173837475573036, "tokens_seen": 3040018432 }, { "epoch": 0.16, "learning_rate": 0.00042408041060735673, "loss": 2.6475, "theoretical_loss": 3.3173782638392746, "tokens_seen": 3040083968 }, { "epoch": 0.16, "learning_rate": 0.00042397348160821215, "loss": 2.635, "theoretical_loss": 3.3173727802725574, "tokens_seen": 3040149504 } ], "max_steps": 4724, "num_train_epochs": 9223372036854775807, "total_flos": 2.5385043492864e+16, "trial_name": null, "trial_params": null }