|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.21422523285351397, |
|
"global_step": 1012, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 1.0416666666666666e-05, |
|
"loss": 3.1158, |
|
"theoretical_loss": 3.321573280713233, |
|
"tokens_seen": 2990473216 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 2.0833333333333333e-05, |
|
"loss": 2.8674, |
|
"theoretical_loss": 3.321567680436603, |
|
"tokens_seen": 2990538752 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 3.125e-05, |
|
"loss": 3.1083, |
|
"theoretical_loss": 3.321562080317061, |
|
"tokens_seen": 2990604288 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 4.1666666666666665e-05, |
|
"loss": 2.8175, |
|
"theoretical_loss": 3.3215564803546, |
|
"tokens_seen": 2990669824 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 5.208333333333334e-05, |
|
"loss": 2.8746, |
|
"theoretical_loss": 3.321550880549211, |
|
"tokens_seen": 2990735360 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 6.25e-05, |
|
"loss": 2.6234, |
|
"theoretical_loss": 3.321545280900887, |
|
"tokens_seen": 2990800896 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 7.291666666666667e-05, |
|
"loss": 2.6986, |
|
"theoretical_loss": 3.32153968140962, |
|
"tokens_seen": 2990866432 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 8.333333333333333e-05, |
|
"loss": 2.9684, |
|
"theoretical_loss": 3.3215340820754022, |
|
"tokens_seen": 2990931968 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 9.375e-05, |
|
"loss": 3.0289, |
|
"theoretical_loss": 3.321528482898225, |
|
"tokens_seen": 2990997504 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00010416666666666667, |
|
"loss": 2.5923, |
|
"theoretical_loss": 3.3215228838780817, |
|
"tokens_seen": 2991063040 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00011458333333333333, |
|
"loss": 2.7436, |
|
"theoretical_loss": 3.3215172850149637, |
|
"tokens_seen": 2991128576 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.000125, |
|
"loss": 2.7711, |
|
"theoretical_loss": 3.3215116863088636, |
|
"tokens_seen": 2991194112 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00013541666666666666, |
|
"loss": 2.4174, |
|
"theoretical_loss": 3.3215060877597735, |
|
"tokens_seen": 2991259648 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00014583333333333335, |
|
"loss": 2.5628, |
|
"theoretical_loss": 3.3215004893676854, |
|
"tokens_seen": 2991325184 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00015625, |
|
"loss": 2.5322, |
|
"theoretical_loss": 3.3214948911325908, |
|
"tokens_seen": 2991390720 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00016666666666666666, |
|
"loss": 2.5703, |
|
"theoretical_loss": 3.321489293054483, |
|
"tokens_seen": 2991456256 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00017708333333333335, |
|
"loss": 2.4464, |
|
"theoretical_loss": 3.3214836951333537, |
|
"tokens_seen": 2991521792 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.0001875, |
|
"loss": 2.5979, |
|
"theoretical_loss": 3.321478097369195, |
|
"tokens_seen": 2991587328 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00019791666666666666, |
|
"loss": 2.6103, |
|
"theoretical_loss": 3.321472499761999, |
|
"tokens_seen": 2991652864 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00020833333333333335, |
|
"loss": 2.5729, |
|
"theoretical_loss": 3.321466902311758, |
|
"tokens_seen": 2991718400 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00021875, |
|
"loss": 2.3812, |
|
"theoretical_loss": 3.321461305018464, |
|
"tokens_seen": 2991783936 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00022916666666666666, |
|
"loss": 2.461, |
|
"theoretical_loss": 3.3214557078821096, |
|
"tokens_seen": 2991849472 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"learning_rate": 0.00023958333333333335, |
|
"loss": 2.3162, |
|
"theoretical_loss": 3.3214501109026866, |
|
"tokens_seen": 2991915008 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00025, |
|
"loss": 2.4302, |
|
"theoretical_loss": 3.321444514080187, |
|
"tokens_seen": 2991980544 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"objective/train/docs_used": 1640731, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.594472885131836, |
|
"objective/train/theoretical_loss": 3.321438917414603, |
|
"objective/train/tokens_used": 22097376, |
|
"theoretical_loss": 3.321438917414603, |
|
"tokens_seen": 2992046080 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0002604166666666667, |
|
"loss": 2.6341, |
|
"theoretical_loss": 3.321438917414603, |
|
"tokens_seen": 2992046080 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0002708333333333333, |
|
"loss": 2.6529, |
|
"theoretical_loss": 3.321433320905927, |
|
"tokens_seen": 2992111616 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00028125000000000003, |
|
"loss": 2.6057, |
|
"theoretical_loss": 3.3214277245541513, |
|
"tokens_seen": 2992177152 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0002916666666666667, |
|
"loss": 2.6216, |
|
"theoretical_loss": 3.3214221283592678, |
|
"tokens_seen": 2992242688 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0003020833333333333, |
|
"loss": 2.4586, |
|
"theoretical_loss": 3.321416532321269, |
|
"tokens_seen": 2992308224 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0003125, |
|
"loss": 2.4143, |
|
"theoretical_loss": 3.321410936440146, |
|
"tokens_seen": 2992373760 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0003229166666666667, |
|
"loss": 2.6421, |
|
"theoretical_loss": 3.3214053407158923, |
|
"tokens_seen": 2992439296 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0003333333333333333, |
|
"loss": 2.4524, |
|
"theoretical_loss": 3.3213997451485, |
|
"tokens_seen": 2992504832 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00034375, |
|
"loss": 2.4758, |
|
"theoretical_loss": 3.32139414973796, |
|
"tokens_seen": 2992570368 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0003541666666666667, |
|
"loss": 2.4524, |
|
"theoretical_loss": 3.3213885544842654, |
|
"tokens_seen": 2992635904 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0003645833333333333, |
|
"loss": 2.7206, |
|
"theoretical_loss": 3.3213829593874085, |
|
"tokens_seen": 2992701440 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.000375, |
|
"loss": 2.5027, |
|
"theoretical_loss": 3.321377364447381, |
|
"tokens_seen": 2992766976 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0003854166666666667, |
|
"loss": 2.6288, |
|
"theoretical_loss": 3.321371769664175, |
|
"tokens_seen": 2992832512 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0003958333333333333, |
|
"loss": 2.6262, |
|
"theoretical_loss": 3.3213661750377836, |
|
"tokens_seen": 2992898048 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00040625000000000004, |
|
"loss": 2.4324, |
|
"theoretical_loss": 3.321360580568198, |
|
"tokens_seen": 2992963584 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004166666666666667, |
|
"loss": 2.3967, |
|
"theoretical_loss": 3.3213549862554106, |
|
"tokens_seen": 2993029120 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004270833333333333, |
|
"loss": 2.4334, |
|
"theoretical_loss": 3.321349392099414, |
|
"tokens_seen": 2993094656 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004375, |
|
"loss": 2.5264, |
|
"theoretical_loss": 3.3213437981001994, |
|
"tokens_seen": 2993160192 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004479166666666667, |
|
"loss": 2.3953, |
|
"theoretical_loss": 3.32133820425776, |
|
"tokens_seen": 2993225728 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004583333333333333, |
|
"loss": 2.5172, |
|
"theoretical_loss": 3.3213326105720875, |
|
"tokens_seen": 2993291264 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.00046875, |
|
"loss": 2.6647, |
|
"theoretical_loss": 3.321327017043174, |
|
"tokens_seen": 2993356800 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004791666666666667, |
|
"loss": 2.4897, |
|
"theoretical_loss": 3.3213214236710122, |
|
"tokens_seen": 2993422336 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004895833333333333, |
|
"loss": 2.5138, |
|
"theoretical_loss": 3.321315830455594, |
|
"tokens_seen": 2993487872 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0005, |
|
"loss": 2.372, |
|
"theoretical_loss": 3.321310237396911, |
|
"tokens_seen": 2993553408 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004998930710008554, |
|
"loss": 2.3235, |
|
"theoretical_loss": 3.321304644494956, |
|
"tokens_seen": 2993618944 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"objective/train/docs_used": 1641796, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 3.1646158695220947, |
|
"objective/train/theoretical_loss": 3.3212990517497207, |
|
"objective/train/tokens_used": 23735776, |
|
"theoretical_loss": 3.3212990517497207, |
|
"tokens_seen": 2993684480 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004997861420017108, |
|
"loss": 2.7205, |
|
"theoretical_loss": 3.3212990517497207, |
|
"tokens_seen": 2993684480 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004996792130025663, |
|
"loss": 2.6519, |
|
"theoretical_loss": 3.3212934591611982, |
|
"tokens_seen": 2993750016 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004995722840034218, |
|
"loss": 2.5422, |
|
"theoretical_loss": 3.3212878667293797, |
|
"tokens_seen": 2993815552 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004994653550042771, |
|
"loss": 2.6428, |
|
"theoretical_loss": 3.321282274454258, |
|
"tokens_seen": 2993881088 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004993584260051326, |
|
"loss": 2.5495, |
|
"theoretical_loss": 3.321276682335825, |
|
"tokens_seen": 2993946624 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.000499251497005988, |
|
"loss": 2.5962, |
|
"theoretical_loss": 3.321271090374073, |
|
"tokens_seen": 2994012160 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004991445680068435, |
|
"loss": 2.5919, |
|
"theoretical_loss": 3.3212654985689936, |
|
"tokens_seen": 2994077696 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004990376390076989, |
|
"loss": 2.5942, |
|
"theoretical_loss": 3.32125990692058, |
|
"tokens_seen": 2994143232 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004989307100085543, |
|
"loss": 2.2569, |
|
"theoretical_loss": 3.3212543154288237, |
|
"tokens_seen": 2994208768 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004988237810094098, |
|
"loss": 2.446, |
|
"theoretical_loss": 3.321248724093717, |
|
"tokens_seen": 2994274304 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004987168520102651, |
|
"loss": 2.5982, |
|
"theoretical_loss": 3.3212431329152525, |
|
"tokens_seen": 2994339840 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004986099230111207, |
|
"loss": 2.3817, |
|
"theoretical_loss": 3.3212375418934217, |
|
"tokens_seen": 2994405376 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004985029940119761, |
|
"loss": 2.5162, |
|
"theoretical_loss": 3.321231951028217, |
|
"tokens_seen": 2994470912 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004983960650128315, |
|
"loss": 2.7158, |
|
"theoretical_loss": 3.321226360319631, |
|
"tokens_seen": 2994536448 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004982891360136869, |
|
"loss": 2.555, |
|
"theoretical_loss": 3.3212207697676552, |
|
"tokens_seen": 2994601984 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004981822070145423, |
|
"loss": 2.2944, |
|
"theoretical_loss": 3.3212151793722824, |
|
"tokens_seen": 2994667520 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004980752780153978, |
|
"loss": 2.4699, |
|
"theoretical_loss": 3.3212095891335043, |
|
"tokens_seen": 2994733056 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004979683490162532, |
|
"loss": 2.5024, |
|
"theoretical_loss": 3.321203999051314, |
|
"tokens_seen": 2994798592 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004978614200171087, |
|
"loss": 2.5346, |
|
"theoretical_loss": 3.321198409125702, |
|
"tokens_seen": 2994864128 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004977544910179641, |
|
"loss": 2.7699, |
|
"theoretical_loss": 3.321192819356662, |
|
"tokens_seen": 2994929664 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 0.0004976475620188195, |
|
"loss": 2.3356, |
|
"theoretical_loss": 3.321187229744186, |
|
"tokens_seen": 2994995200 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.000497540633019675, |
|
"loss": 2.4074, |
|
"theoretical_loss": 3.3211816402882652, |
|
"tokens_seen": 2995060736 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004974337040205304, |
|
"loss": 2.6453, |
|
"theoretical_loss": 3.321176050988893, |
|
"tokens_seen": 2995126272 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004973267750213858, |
|
"loss": 2.5405, |
|
"theoretical_loss": 3.3211704618460614, |
|
"tokens_seen": 2995191808 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004972198460222412, |
|
"loss": 2.645, |
|
"theoretical_loss": 3.3211648728597614, |
|
"tokens_seen": 2995257344 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"objective/train/docs_used": 1642448, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.835965394973755, |
|
"objective/train/theoretical_loss": 3.3211592840299864, |
|
"objective/train/tokens_used": 25374176, |
|
"theoretical_loss": 3.3211592840299864, |
|
"tokens_seen": 2995322880 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004971129170230966, |
|
"loss": 2.5601, |
|
"theoretical_loss": 3.3211592840299864, |
|
"tokens_seen": 2995322880 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004970059880239521, |
|
"loss": 2.6971, |
|
"theoretical_loss": 3.3211536953567284, |
|
"tokens_seen": 2995388416 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004968990590248076, |
|
"loss": 2.4257, |
|
"theoretical_loss": 3.321148106839979, |
|
"tokens_seen": 2995453952 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.000496792130025663, |
|
"loss": 2.5419, |
|
"theoretical_loss": 3.321142518479731, |
|
"tokens_seen": 2995519488 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004966852010265184, |
|
"loss": 2.7436, |
|
"theoretical_loss": 3.321136930275977, |
|
"tokens_seen": 2995585024 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004965782720273738, |
|
"loss": 2.5704, |
|
"theoretical_loss": 3.321131342228708, |
|
"tokens_seen": 2995650560 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004964713430282293, |
|
"loss": 2.8543, |
|
"theoretical_loss": 3.321125754337917, |
|
"tokens_seen": 2995716096 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004963644140290847, |
|
"loss": 2.5733, |
|
"theoretical_loss": 3.321120166603596, |
|
"tokens_seen": 2995781632 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004962574850299401, |
|
"loss": 2.5429, |
|
"theoretical_loss": 3.3211145790257373, |
|
"tokens_seen": 2995847168 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004961505560307955, |
|
"loss": 2.7132, |
|
"theoretical_loss": 3.3211089916043326, |
|
"tokens_seen": 2995912704 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.000496043627031651, |
|
"loss": 2.4883, |
|
"theoretical_loss": 3.3211034043393743, |
|
"tokens_seen": 2995978240 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004959366980325064, |
|
"loss": 2.5549, |
|
"theoretical_loss": 3.3210978172308554, |
|
"tokens_seen": 2996043776 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004958297690333619, |
|
"loss": 2.7343, |
|
"theoretical_loss": 3.3210922302787673, |
|
"tokens_seen": 2996109312 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004957228400342173, |
|
"loss": 2.5603, |
|
"theoretical_loss": 3.3210866434831026, |
|
"tokens_seen": 2996174848 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004956159110350727, |
|
"loss": 2.7516, |
|
"theoretical_loss": 3.3210810568438527, |
|
"tokens_seen": 2996240384 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004955089820359281, |
|
"loss": 2.4148, |
|
"theoretical_loss": 3.3210754703610106, |
|
"tokens_seen": 2996305920 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004954020530367836, |
|
"loss": 2.5289, |
|
"theoretical_loss": 3.3210698840345687, |
|
"tokens_seen": 2996371456 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004952951240376391, |
|
"loss": 2.4535, |
|
"theoretical_loss": 3.321064297864518, |
|
"tokens_seen": 2996436992 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004951881950384944, |
|
"loss": 2.3, |
|
"theoretical_loss": 3.3210587118508523, |
|
"tokens_seen": 2996502528 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004950812660393499, |
|
"loss": 2.6429, |
|
"theoretical_loss": 3.3210531259935627, |
|
"tokens_seen": 2996568064 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004949743370402053, |
|
"loss": 2.4731, |
|
"theoretical_loss": 3.3210475402926414, |
|
"tokens_seen": 2996633600 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004948674080410608, |
|
"loss": 2.636, |
|
"theoretical_loss": 3.321041954748081, |
|
"tokens_seen": 2996699136 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004947604790419162, |
|
"loss": 2.5385, |
|
"theoretical_loss": 3.3210363693598737, |
|
"tokens_seen": 2996764672 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004946535500427716, |
|
"loss": 2.4859, |
|
"theoretical_loss": 3.321030784128012, |
|
"tokens_seen": 2996830208 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.000494546621043627, |
|
"loss": 2.5137, |
|
"theoretical_loss": 3.321025199052487, |
|
"tokens_seen": 2996895744 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"objective/train/docs_used": 1643593, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.5586180686950684, |
|
"objective/train/theoretical_loss": 3.321019614133292, |
|
"objective/train/tokens_used": 27012576, |
|
"theoretical_loss": 3.321019614133292, |
|
"tokens_seen": 2996961280 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004944396920444824, |
|
"loss": 2.5684, |
|
"theoretical_loss": 3.321019614133292, |
|
"tokens_seen": 2996961280 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.000494332763045338, |
|
"loss": 2.5426, |
|
"theoretical_loss": 3.321014029370419, |
|
"tokens_seen": 2997026816 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004942258340461933, |
|
"loss": 2.6556, |
|
"theoretical_loss": 3.3210084447638595, |
|
"tokens_seen": 2997092352 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004941189050470488, |
|
"loss": 2.668, |
|
"theoretical_loss": 3.3210028603136066, |
|
"tokens_seen": 2997157888 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004940119760479042, |
|
"loss": 2.544, |
|
"theoretical_loss": 3.320997276019652, |
|
"tokens_seen": 2997223424 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004939050470487596, |
|
"loss": 2.4703, |
|
"theoretical_loss": 3.320991691881988, |
|
"tokens_seen": 2997288960 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.000493798118049615, |
|
"loss": 2.4878, |
|
"theoretical_loss": 3.3209861079006067, |
|
"tokens_seen": 2997354496 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004936911890504705, |
|
"loss": 2.5617, |
|
"theoretical_loss": 3.320980524075501, |
|
"tokens_seen": 2997420032 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.000493584260051326, |
|
"loss": 2.6501, |
|
"theoretical_loss": 3.320974940406662, |
|
"tokens_seen": 2997485568 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004934773310521813, |
|
"loss": 2.4307, |
|
"theoretical_loss": 3.3209693568940826, |
|
"tokens_seen": 2997551104 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004933704020530368, |
|
"loss": 2.4372, |
|
"theoretical_loss": 3.320963773537755, |
|
"tokens_seen": 2997616640 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004932634730538923, |
|
"loss": 2.5929, |
|
"theoretical_loss": 3.3209581903376715, |
|
"tokens_seen": 2997682176 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004931565440547477, |
|
"loss": 2.3358, |
|
"theoretical_loss": 3.320952607293824, |
|
"tokens_seen": 2997747712 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004930496150556031, |
|
"loss": 2.6659, |
|
"theoretical_loss": 3.3209470244062045, |
|
"tokens_seen": 2997813248 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004929426860564585, |
|
"loss": 2.5873, |
|
"theoretical_loss": 3.320941441674806, |
|
"tokens_seen": 2997878784 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004928357570573139, |
|
"loss": 2.5936, |
|
"theoretical_loss": 3.32093585909962, |
|
"tokens_seen": 2997944320 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004927288280581693, |
|
"loss": 2.5437, |
|
"theoretical_loss": 3.320930276680639, |
|
"tokens_seen": 2998009856 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004926218990590249, |
|
"loss": 2.5207, |
|
"theoretical_loss": 3.320924694417855, |
|
"tokens_seen": 2998075392 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 0.0004925149700598803, |
|
"loss": 2.624, |
|
"theoretical_loss": 3.3209191123112607, |
|
"tokens_seen": 2998140928 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004924080410607357, |
|
"loss": 2.6163, |
|
"theoretical_loss": 3.3209135303608477, |
|
"tokens_seen": 2998206464 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004923011120615911, |
|
"loss": 2.502, |
|
"theoretical_loss": 3.320907948566609, |
|
"tokens_seen": 2998272000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004921941830624465, |
|
"loss": 2.4689, |
|
"theoretical_loss": 3.320902366928536, |
|
"tokens_seen": 2998337536 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.000492087254063302, |
|
"loss": 2.7454, |
|
"theoretical_loss": 3.3208967854466214, |
|
"tokens_seen": 2998403072 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004919803250641574, |
|
"loss": 2.601, |
|
"theoretical_loss": 3.320891204120857, |
|
"tokens_seen": 2998468608 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004918733960650129, |
|
"loss": 2.2926, |
|
"theoretical_loss": 3.3208856229512356, |
|
"tokens_seen": 2998534144 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"objective/train/docs_used": 1644132, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.451066732406616, |
|
"objective/train/theoretical_loss": 3.320880041937749, |
|
"objective/train/tokens_used": 28650976, |
|
"theoretical_loss": 3.320880041937749, |
|
"tokens_seen": 2998599680 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004917664670658682, |
|
"loss": 2.6377, |
|
"theoretical_loss": 3.320880041937749, |
|
"tokens_seen": 2998599680 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004916595380667237, |
|
"loss": 2.5554, |
|
"theoretical_loss": 3.3208744610803898, |
|
"tokens_seen": 2998665216 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004915526090675792, |
|
"loss": 2.6001, |
|
"theoretical_loss": 3.3208688803791495, |
|
"tokens_seen": 2998730752 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004914456800684346, |
|
"loss": 2.5503, |
|
"theoretical_loss": 3.320863299834021, |
|
"tokens_seen": 2998796288 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00049133875106929, |
|
"loss": 2.5515, |
|
"theoretical_loss": 3.3208577194449966, |
|
"tokens_seen": 2998861824 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004912318220701454, |
|
"loss": 2.5162, |
|
"theoretical_loss": 3.320852139212068, |
|
"tokens_seen": 2998927360 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004911248930710008, |
|
"loss": 2.5311, |
|
"theoretical_loss": 3.3208465591352274, |
|
"tokens_seen": 2998992896 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004910179640718563, |
|
"loss": 2.4144, |
|
"theoretical_loss": 3.3208409792144677, |
|
"tokens_seen": 2999058432 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004909110350727117, |
|
"loss": 2.6587, |
|
"theoretical_loss": 3.3208353994497806, |
|
"tokens_seen": 2999123968 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004908041060735672, |
|
"loss": 2.4581, |
|
"theoretical_loss": 3.320829819841158, |
|
"tokens_seen": 2999189504 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004906971770744226, |
|
"loss": 2.5588, |
|
"theoretical_loss": 3.320824240388593, |
|
"tokens_seen": 2999255040 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.000490590248075278, |
|
"loss": 2.4129, |
|
"theoretical_loss": 3.320818661092077, |
|
"tokens_seen": 2999320576 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004904833190761335, |
|
"loss": 2.5726, |
|
"theoretical_loss": 3.320813081951603, |
|
"tokens_seen": 2999386112 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004903763900769889, |
|
"loss": 2.6172, |
|
"theoretical_loss": 3.3208075029671624, |
|
"tokens_seen": 2999451648 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004902694610778443, |
|
"loss": 2.6785, |
|
"theoretical_loss": 3.320801924138748, |
|
"tokens_seen": 2999517184 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004901625320786997, |
|
"loss": 2.6485, |
|
"theoretical_loss": 3.320796345466352, |
|
"tokens_seen": 2999582720 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004900556030795552, |
|
"loss": 2.5381, |
|
"theoretical_loss": 3.3207907669499663, |
|
"tokens_seen": 2999648256 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004899486740804106, |
|
"loss": 2.5339, |
|
"theoretical_loss": 3.320785188589584, |
|
"tokens_seen": 2999713792 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004898417450812661, |
|
"loss": 2.6513, |
|
"theoretical_loss": 3.320779610385196, |
|
"tokens_seen": 2999779328 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004897348160821215, |
|
"loss": 2.6681, |
|
"theoretical_loss": 3.3207740323367956, |
|
"tokens_seen": 2999844864 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004896278870829769, |
|
"loss": 2.6658, |
|
"theoretical_loss": 3.320768454444374, |
|
"tokens_seen": 2999910400 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004895209580838323, |
|
"loss": 2.6133, |
|
"theoretical_loss": 3.3207628767079242, |
|
"tokens_seen": 2999975936 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004894140290846878, |
|
"loss": 2.3937, |
|
"theoretical_loss": 3.3207572991274388, |
|
"tokens_seen": 3000041472 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004893071000855432, |
|
"loss": 2.433, |
|
"theoretical_loss": 3.3207517217029094, |
|
"tokens_seen": 3000107008 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004892001710863986, |
|
"loss": 2.5588, |
|
"theoretical_loss": 3.3207461444343283, |
|
"tokens_seen": 3000172544 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"objective/train/docs_used": 1645467, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.7996082305908203, |
|
"objective/train/theoretical_loss": 3.3207405673216877, |
|
"objective/train/tokens_used": 30289376, |
|
"theoretical_loss": 3.3207405673216877, |
|
"tokens_seen": 3000238080 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004890932420872541, |
|
"loss": 2.7605, |
|
"theoretical_loss": 3.3207405673216877, |
|
"tokens_seen": 3000238080 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004889863130881094, |
|
"loss": 2.6024, |
|
"theoretical_loss": 3.3207349903649797, |
|
"tokens_seen": 3000303616 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.000488879384088965, |
|
"loss": 2.4747, |
|
"theoretical_loss": 3.320729413564197, |
|
"tokens_seen": 3000369152 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004887724550898204, |
|
"loss": 2.5954, |
|
"theoretical_loss": 3.3207238369193313, |
|
"tokens_seen": 3000434688 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004886655260906758, |
|
"loss": 2.6027, |
|
"theoretical_loss": 3.3207182604303753, |
|
"tokens_seen": 3000500224 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004885585970915312, |
|
"loss": 2.6733, |
|
"theoretical_loss": 3.3207126840973213, |
|
"tokens_seen": 3000565760 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004884516680923866, |
|
"loss": 2.5098, |
|
"theoretical_loss": 3.320707107920161, |
|
"tokens_seen": 3000631296 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004883447390932422, |
|
"loss": 2.4302, |
|
"theoretical_loss": 3.3207015318988873, |
|
"tokens_seen": 3000696832 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004882378100940975, |
|
"loss": 2.5576, |
|
"theoretical_loss": 3.3206959560334917, |
|
"tokens_seen": 3000762368 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00048813088109495294, |
|
"loss": 2.4798, |
|
"theoretical_loss": 3.320690380323967, |
|
"tokens_seen": 3000827904 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004880239520958084, |
|
"loss": 2.3385, |
|
"theoretical_loss": 3.320684804770305, |
|
"tokens_seen": 3000893440 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004879170230966638, |
|
"loss": 2.4282, |
|
"theoretical_loss": 3.3206792293724985, |
|
"tokens_seen": 3000958976 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00048781009409751925, |
|
"loss": 2.5659, |
|
"theoretical_loss": 3.3206736541305393, |
|
"tokens_seen": 3001024512 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004877031650983747, |
|
"loss": 2.7023, |
|
"theoretical_loss": 3.3206680790444194, |
|
"tokens_seen": 3001090048 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.0004875962360992301, |
|
"loss": 2.4634, |
|
"theoretical_loss": 3.3206625041141318, |
|
"tokens_seen": 3001155584 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 0.00048748930710008556, |
|
"loss": 2.7005, |
|
"theoretical_loss": 3.3206569293396684, |
|
"tokens_seen": 3001221120 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.000487382378100941, |
|
"loss": 2.6176, |
|
"theoretical_loss": 3.3206513547210212, |
|
"tokens_seen": 3001286656 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048727544910179645, |
|
"loss": 2.4467, |
|
"theoretical_loss": 3.320645780258183, |
|
"tokens_seen": 3001352192 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004871685201026518, |
|
"loss": 2.5512, |
|
"theoretical_loss": 3.320640205951145, |
|
"tokens_seen": 3001417728 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004870615911035073, |
|
"loss": 2.5798, |
|
"theoretical_loss": 3.3206346317999005, |
|
"tokens_seen": 3001483264 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004869546621043627, |
|
"loss": 2.6427, |
|
"theoretical_loss": 3.3206290578044415, |
|
"tokens_seen": 3001548800 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048684773310521813, |
|
"loss": 2.596, |
|
"theoretical_loss": 3.32062348396476, |
|
"tokens_seen": 3001614336 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004867408041060736, |
|
"loss": 2.6275, |
|
"theoretical_loss": 3.3206179102808484, |
|
"tokens_seen": 3001679872 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.000486633875106929, |
|
"loss": 2.6446, |
|
"theoretical_loss": 3.3206123367526987, |
|
"tokens_seen": 3001745408 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048652694610778444, |
|
"loss": 2.248, |
|
"theoretical_loss": 3.3206067633803036, |
|
"tokens_seen": 3001810944 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"objective/train/docs_used": 1646713, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.3031907081604004, |
|
"objective/train/theoretical_loss": 3.320601190163655, |
|
"objective/train/tokens_used": 31927776, |
|
"theoretical_loss": 3.320601190163655, |
|
"tokens_seen": 3001876480 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048642001710863986, |
|
"loss": 2.5457, |
|
"theoretical_loss": 3.320601190163655, |
|
"tokens_seen": 3001876480 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048631308810949533, |
|
"loss": 2.6029, |
|
"theoretical_loss": 3.320595617102745, |
|
"tokens_seen": 3001942016 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004862061591103507, |
|
"loss": 2.4349, |
|
"theoretical_loss": 3.320590044197566, |
|
"tokens_seen": 3002007552 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048609923011120617, |
|
"loss": 2.6641, |
|
"theoretical_loss": 3.320584471448111, |
|
"tokens_seen": 3002073088 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004859923011120616, |
|
"loss": 2.6479, |
|
"theoretical_loss": 3.320578898854371, |
|
"tokens_seen": 3002138624 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048588537211291706, |
|
"loss": 2.6043, |
|
"theoretical_loss": 3.3205733264163393, |
|
"tokens_seen": 3002204160 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004857784431137724, |
|
"loss": 2.5969, |
|
"theoretical_loss": 3.3205677541340073, |
|
"tokens_seen": 3002269696 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004856715141146279, |
|
"loss": 2.5909, |
|
"theoretical_loss": 3.320562182007368, |
|
"tokens_seen": 3002335232 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048556458511548337, |
|
"loss": 2.6341, |
|
"theoretical_loss": 3.3205566100364132, |
|
"tokens_seen": 3002400768 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048545765611633873, |
|
"loss": 2.5535, |
|
"theoretical_loss": 3.320551038221135, |
|
"tokens_seen": 3002466304 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004853507271171942, |
|
"loss": 2.6686, |
|
"theoretical_loss": 3.320545466561526, |
|
"tokens_seen": 3002531840 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004852437981180496, |
|
"loss": 2.4385, |
|
"theoretical_loss": 3.3205398950575784, |
|
"tokens_seen": 3002597376 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048513686911890504, |
|
"loss": 2.2543, |
|
"theoretical_loss": 3.3205343237092846, |
|
"tokens_seen": 3002662912 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048502994011976046, |
|
"loss": 2.2638, |
|
"theoretical_loss": 3.320528752516636, |
|
"tokens_seen": 3002728448 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048492301112061594, |
|
"loss": 2.6381, |
|
"theoretical_loss": 3.320523181479626, |
|
"tokens_seen": 3002793984 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048481608212147135, |
|
"loss": 2.3925, |
|
"theoretical_loss": 3.3205176105982463, |
|
"tokens_seen": 3002859520 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048470915312232677, |
|
"loss": 2.4351, |
|
"theoretical_loss": 3.320512039872489, |
|
"tokens_seen": 3002925056 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048460222412318225, |
|
"loss": 2.5748, |
|
"theoretical_loss": 3.320506469302347, |
|
"tokens_seen": 3002990592 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048449529512403766, |
|
"loss": 2.6547, |
|
"theoretical_loss": 3.320500898887812, |
|
"tokens_seen": 3003056128 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004843883661248931, |
|
"loss": 2.5674, |
|
"theoretical_loss": 3.3204953286288763, |
|
"tokens_seen": 3003121664 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004842814371257485, |
|
"loss": 2.8503, |
|
"theoretical_loss": 3.3204897585255324, |
|
"tokens_seen": 3003187200 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.000484174508126604, |
|
"loss": 2.3909, |
|
"theoretical_loss": 3.3204841885777725, |
|
"tokens_seen": 3003252736 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048406757912745934, |
|
"loss": 2.6026, |
|
"theoretical_loss": 3.3204786187855886, |
|
"tokens_seen": 3003318272 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004839606501283148, |
|
"loss": 2.3427, |
|
"theoretical_loss": 3.3204730491489727, |
|
"tokens_seen": 3003383808 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048385372112917023, |
|
"loss": 2.5079, |
|
"theoretical_loss": 3.320467479667918, |
|
"tokens_seen": 3003449344 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"objective/train/docs_used": 1647443, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.66249418258667, |
|
"objective/train/theoretical_loss": 3.3204619103424164, |
|
"objective/train/tokens_used": 33566176, |
|
"theoretical_loss": 3.3204619103424164, |
|
"tokens_seen": 3003514880 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048374679213002565, |
|
"loss": 2.5391, |
|
"theoretical_loss": 3.3204619103424164, |
|
"tokens_seen": 3003514880 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004836398631308811, |
|
"loss": 2.5985, |
|
"theoretical_loss": 3.3204563411724597, |
|
"tokens_seen": 3003580416 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048353293413173654, |
|
"loss": 2.5948, |
|
"theoretical_loss": 3.3204507721580403, |
|
"tokens_seen": 3003645952 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.000483426005132592, |
|
"loss": 2.3641, |
|
"theoretical_loss": 3.320445203299151, |
|
"tokens_seen": 3003711488 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004833190761334474, |
|
"loss": 2.607, |
|
"theoretical_loss": 3.3204396345957834, |
|
"tokens_seen": 3003777024 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048321214713430285, |
|
"loss": 2.6218, |
|
"theoretical_loss": 3.3204340660479303, |
|
"tokens_seen": 3003842560 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048310521813515827, |
|
"loss": 2.7135, |
|
"theoretical_loss": 3.320428497655584, |
|
"tokens_seen": 3003908096 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004829982891360137, |
|
"loss": 2.7583, |
|
"theoretical_loss": 3.320422929418736, |
|
"tokens_seen": 3003973632 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004828913601368691, |
|
"loss": 2.3356, |
|
"theoretical_loss": 3.320417361337379, |
|
"tokens_seen": 3004039168 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004827844311377246, |
|
"loss": 2.6196, |
|
"theoretical_loss": 3.320411793411506, |
|
"tokens_seen": 3004104704 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.00048267750213858, |
|
"loss": 2.5513, |
|
"theoretical_loss": 3.3204062256411078, |
|
"tokens_seen": 3004170240 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004825705731394354, |
|
"loss": 2.3649, |
|
"theoretical_loss": 3.320400658026178, |
|
"tokens_seen": 3004235776 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 0.0004824636441402909, |
|
"loss": 2.5991, |
|
"theoretical_loss": 3.320395090566708, |
|
"tokens_seen": 3004301312 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048235671514114625, |
|
"loss": 2.5668, |
|
"theoretical_loss": 3.3203895232626905, |
|
"tokens_seen": 3004366848 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004822497861420017, |
|
"loss": 2.6184, |
|
"theoretical_loss": 3.3203839561141173, |
|
"tokens_seen": 3004432384 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048214285714285715, |
|
"loss": 2.4345, |
|
"theoretical_loss": 3.3203783891209815, |
|
"tokens_seen": 3004497920 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004820359281437126, |
|
"loss": 2.702, |
|
"theoretical_loss": 3.320372822283275, |
|
"tokens_seen": 3004563456 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.000481928999144568, |
|
"loss": 2.4543, |
|
"theoretical_loss": 3.3203672556009898, |
|
"tokens_seen": 3004628992 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048182207014542346, |
|
"loss": 2.7525, |
|
"theoretical_loss": 3.3203616890741183, |
|
"tokens_seen": 3004694528 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048171514114627893, |
|
"loss": 2.5572, |
|
"theoretical_loss": 3.320356122702653, |
|
"tokens_seen": 3004760064 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004816082121471343, |
|
"loss": 2.4409, |
|
"theoretical_loss": 3.3203505564865856, |
|
"tokens_seen": 3004825600 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048150128314798977, |
|
"loss": 2.5298, |
|
"theoretical_loss": 3.320344990425909, |
|
"tokens_seen": 3004891136 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004813943541488452, |
|
"loss": 2.5959, |
|
"theoretical_loss": 3.3203394245206153, |
|
"tokens_seen": 3004956672 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004812874251497006, |
|
"loss": 2.5989, |
|
"theoretical_loss": 3.3203338587706965, |
|
"tokens_seen": 3005022208 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.000481180496150556, |
|
"loss": 2.3394, |
|
"theoretical_loss": 3.320328293176145, |
|
"tokens_seen": 3005087744 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"objective/train/docs_used": 1647973, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.042917013168335, |
|
"objective/train/theoretical_loss": 3.3203227277369534, |
|
"objective/train/tokens_used": 35204576, |
|
"theoretical_loss": 3.3203227277369534, |
|
"tokens_seen": 3005153280 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004810735671514115, |
|
"loss": 2.4204, |
|
"theoretical_loss": 3.3203227277369534, |
|
"tokens_seen": 3005153280 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048096663815226686, |
|
"loss": 2.4667, |
|
"theoretical_loss": 3.320317162453114, |
|
"tokens_seen": 3005218816 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048085970915312233, |
|
"loss": 2.6191, |
|
"theoretical_loss": 3.3203115973246184, |
|
"tokens_seen": 3005284352 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048075278015397775, |
|
"loss": 2.6692, |
|
"theoretical_loss": 3.3203060323514593, |
|
"tokens_seen": 3005349888 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004806458511548332, |
|
"loss": 2.6315, |
|
"theoretical_loss": 3.320300467533629, |
|
"tokens_seen": 3005415424 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048053892215568864, |
|
"loss": 2.3996, |
|
"theoretical_loss": 3.3202949028711197, |
|
"tokens_seen": 3005480960 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048043199315654406, |
|
"loss": 2.6156, |
|
"theoretical_loss": 3.320289338363924, |
|
"tokens_seen": 3005546496 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048032506415739953, |
|
"loss": 2.6943, |
|
"theoretical_loss": 3.3202837740120335, |
|
"tokens_seen": 3005612032 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004802181351582549, |
|
"loss": 2.5853, |
|
"theoretical_loss": 3.320278209815441, |
|
"tokens_seen": 3005677568 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00048011120615911037, |
|
"loss": 2.7011, |
|
"theoretical_loss": 3.3202726457741387, |
|
"tokens_seen": 3005743104 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004800042771599658, |
|
"loss": 2.6477, |
|
"theoretical_loss": 3.320267081888119, |
|
"tokens_seen": 3005808640 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004798973481608212, |
|
"loss": 2.5143, |
|
"theoretical_loss": 3.320261518157374, |
|
"tokens_seen": 3005874176 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004797904191616766, |
|
"loss": 2.5315, |
|
"theoretical_loss": 3.3202559545818957, |
|
"tokens_seen": 3005939712 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004796834901625321, |
|
"loss": 2.7104, |
|
"theoretical_loss": 3.3202503911616765, |
|
"tokens_seen": 3006005248 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047957656116338757, |
|
"loss": 2.313, |
|
"theoretical_loss": 3.320244827896709, |
|
"tokens_seen": 3006070784 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047946963216424294, |
|
"loss": 2.346, |
|
"theoretical_loss": 3.320239264786986, |
|
"tokens_seen": 3006136320 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004793627031650984, |
|
"loss": 2.4753, |
|
"theoretical_loss": 3.3202337018324983, |
|
"tokens_seen": 3006201856 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047925577416595383, |
|
"loss": 2.6649, |
|
"theoretical_loss": 3.3202281390332393, |
|
"tokens_seen": 3006267392 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047914884516680925, |
|
"loss": 2.4813, |
|
"theoretical_loss": 3.3202225763892015, |
|
"tokens_seen": 3006332928 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047904191616766467, |
|
"loss": 2.581, |
|
"theoretical_loss": 3.320217013900376, |
|
"tokens_seen": 3006398464 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047893498716852014, |
|
"loss": 2.5711, |
|
"theoretical_loss": 3.320211451566756, |
|
"tokens_seen": 3006464000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004788280581693755, |
|
"loss": 2.5481, |
|
"theoretical_loss": 3.3202058893883333, |
|
"tokens_seen": 3006529536 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.000478721129170231, |
|
"loss": 2.7041, |
|
"theoretical_loss": 3.320200327365101, |
|
"tokens_seen": 3006595072 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047861420017108645, |
|
"loss": 2.5201, |
|
"theoretical_loss": 3.3201947654970505, |
|
"tokens_seen": 3006660608 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004785072711719418, |
|
"loss": 2.6215, |
|
"theoretical_loss": 3.3201892037841745, |
|
"tokens_seen": 3006726144 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"objective/train/docs_used": 1649106, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.610767364501953, |
|
"objective/train/theoretical_loss": 3.320183642226465, |
|
"objective/train/tokens_used": 36842976, |
|
"theoretical_loss": 3.320183642226465, |
|
"tokens_seen": 3006791680 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004784003421727973, |
|
"loss": 2.6786, |
|
"theoretical_loss": 3.320183642226465, |
|
"tokens_seen": 3006791680 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004782934131736527, |
|
"loss": 2.6331, |
|
"theoretical_loss": 3.3201780808239145, |
|
"tokens_seen": 3006857216 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.0004781864841745082, |
|
"loss": 2.6597, |
|
"theoretical_loss": 3.3201725195765155, |
|
"tokens_seen": 3006922752 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047807955517536354, |
|
"loss": 2.5184, |
|
"theoretical_loss": 3.32016695848426, |
|
"tokens_seen": 3006988288 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.000477972626176219, |
|
"loss": 2.6672, |
|
"theoretical_loss": 3.3201613975471402, |
|
"tokens_seen": 3007053824 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047786569717707443, |
|
"loss": 2.6462, |
|
"theoretical_loss": 3.3201558367651485, |
|
"tokens_seen": 3007119360 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047775876817792985, |
|
"loss": 2.4911, |
|
"theoretical_loss": 3.3201502761382775, |
|
"tokens_seen": 3007184896 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047765183917878527, |
|
"loss": 2.6191, |
|
"theoretical_loss": 3.3201447156665194, |
|
"tokens_seen": 3007250432 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047754491017964074, |
|
"loss": 2.4458, |
|
"theoretical_loss": 3.320139155349866, |
|
"tokens_seen": 3007315968 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 0.00047743798118049616, |
|
"loss": 2.3037, |
|
"theoretical_loss": 3.32013359518831, |
|
"tokens_seen": 3007381504 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004773310521813516, |
|
"loss": 2.3071, |
|
"theoretical_loss": 3.3201280351818436, |
|
"tokens_seen": 3007447040 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047722412318220705, |
|
"loss": 2.6427, |
|
"theoretical_loss": 3.320122475330459, |
|
"tokens_seen": 3007512576 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004771171941830624, |
|
"loss": 2.5393, |
|
"theoretical_loss": 3.320116915634149, |
|
"tokens_seen": 3007578112 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004770102651839179, |
|
"loss": 2.2973, |
|
"theoretical_loss": 3.320111356092905, |
|
"tokens_seen": 3007643648 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004769033361847733, |
|
"loss": 2.7483, |
|
"theoretical_loss": 3.3201057967067205, |
|
"tokens_seen": 3007709184 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004767964071856288, |
|
"loss": 2.5022, |
|
"theoretical_loss": 3.320100237475587, |
|
"tokens_seen": 3007774720 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047668947818648415, |
|
"loss": 2.3979, |
|
"theoretical_loss": 3.3200946783994962, |
|
"tokens_seen": 3007840256 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004765825491873396, |
|
"loss": 2.651, |
|
"theoretical_loss": 3.320089119478441, |
|
"tokens_seen": 3007905792 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004764756201881951, |
|
"loss": 2.6367, |
|
"theoretical_loss": 3.3200835607124146, |
|
"tokens_seen": 3007971328 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047636869118905046, |
|
"loss": 2.4628, |
|
"theoretical_loss": 3.3200780021014085, |
|
"tokens_seen": 3008036864 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047626176218990593, |
|
"loss": 2.5774, |
|
"theoretical_loss": 3.3200724436454143, |
|
"tokens_seen": 3008102400 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047615483319076135, |
|
"loss": 2.7166, |
|
"theoretical_loss": 3.320066885344425, |
|
"tokens_seen": 3008167936 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047604790419161677, |
|
"loss": 2.5213, |
|
"theoretical_loss": 3.3200613271984336, |
|
"tokens_seen": 3008233472 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004759409751924722, |
|
"loss": 2.6397, |
|
"theoretical_loss": 3.320055769207431, |
|
"tokens_seen": 3008299008 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047583404619332766, |
|
"loss": 2.5147, |
|
"theoretical_loss": 3.3200502113714108, |
|
"tokens_seen": 3008364544 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"objective/train/docs_used": 1649879, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.4313642978668213, |
|
"objective/train/theoretical_loss": 3.3200446536903643, |
|
"objective/train/tokens_used": 38481376, |
|
"theoretical_loss": 3.3200446536903643, |
|
"tokens_seen": 3008430080 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.000475727117194183, |
|
"loss": 2.7199, |
|
"theoretical_loss": 3.3200446536903643, |
|
"tokens_seen": 3008430080 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004756201881950385, |
|
"loss": 2.429, |
|
"theoretical_loss": 3.3200390961642845, |
|
"tokens_seen": 3008495616 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047551325919589397, |
|
"loss": 2.7449, |
|
"theoretical_loss": 3.320033538793163, |
|
"tokens_seen": 3008561152 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004754063301967494, |
|
"loss": 2.2979, |
|
"theoretical_loss": 3.3200279815769926, |
|
"tokens_seen": 3008626688 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004752994011976048, |
|
"loss": 2.5655, |
|
"theoretical_loss": 3.320022424515766, |
|
"tokens_seen": 3008692224 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004751924721984602, |
|
"loss": 2.3837, |
|
"theoretical_loss": 3.3200168676094743, |
|
"tokens_seen": 3008757760 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004750855431993157, |
|
"loss": 2.3228, |
|
"theoretical_loss": 3.320011310858111, |
|
"tokens_seen": 3008823296 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047497861420017106, |
|
"loss": 2.4002, |
|
"theoretical_loss": 3.320005754261668, |
|
"tokens_seen": 3008888832 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047487168520102653, |
|
"loss": 2.5257, |
|
"theoretical_loss": 3.320000197820137, |
|
"tokens_seen": 3008954368 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047476475620188195, |
|
"loss": 2.6074, |
|
"theoretical_loss": 3.319994641533511, |
|
"tokens_seen": 3009019904 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047465782720273737, |
|
"loss": 2.6883, |
|
"theoretical_loss": 3.3199890854017826, |
|
"tokens_seen": 3009085440 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047455089820359284, |
|
"loss": 2.3395, |
|
"theoretical_loss": 3.319983529424943, |
|
"tokens_seen": 3009150976 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047444396920444826, |
|
"loss": 2.4104, |
|
"theoretical_loss": 3.319977973602986, |
|
"tokens_seen": 3009216512 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047433704020530374, |
|
"loss": 2.4436, |
|
"theoretical_loss": 3.3199724179359027, |
|
"tokens_seen": 3009282048 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004742301112061591, |
|
"loss": 2.3034, |
|
"theoretical_loss": 3.3199668624236853, |
|
"tokens_seen": 3009347584 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047412318220701457, |
|
"loss": 2.4065, |
|
"theoretical_loss": 3.319961307066327, |
|
"tokens_seen": 3009413120 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047401625320787, |
|
"loss": 2.5668, |
|
"theoretical_loss": 3.31995575186382, |
|
"tokens_seen": 3009478656 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004739093242087254, |
|
"loss": 2.5791, |
|
"theoretical_loss": 3.3199501968161558, |
|
"tokens_seen": 3009544192 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047380239520958083, |
|
"loss": 2.6598, |
|
"theoretical_loss": 3.3199446419233274, |
|
"tokens_seen": 3009609728 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004736954662104363, |
|
"loss": 2.7689, |
|
"theoretical_loss": 3.319939087185327, |
|
"tokens_seen": 3009675264 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047358853721129167, |
|
"loss": 2.6223, |
|
"theoretical_loss": 3.319933532602147, |
|
"tokens_seen": 3009740800 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047348160821214714, |
|
"loss": 2.4611, |
|
"theoretical_loss": 3.3199279781737796, |
|
"tokens_seen": 3009806336 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004733746792130026, |
|
"loss": 2.5834, |
|
"theoretical_loss": 3.3199224239002167, |
|
"tokens_seen": 3009871872 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.000473267750213858, |
|
"loss": 2.7345, |
|
"theoretical_loss": 3.3199168697814514, |
|
"tokens_seen": 3009937408 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047316082121471345, |
|
"loss": 2.456, |
|
"theoretical_loss": 3.3199113158174756, |
|
"tokens_seen": 3010002944 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"objective/train/docs_used": 1651225, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.715864896774292, |
|
"objective/train/theoretical_loss": 3.3199057620082812, |
|
"objective/train/tokens_used": 40119776, |
|
"theoretical_loss": 3.3199057620082812, |
|
"tokens_seen": 3010068480 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047305389221556887, |
|
"loss": 2.441, |
|
"theoretical_loss": 3.3199057620082812, |
|
"tokens_seen": 3010068480 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.00047294696321642434, |
|
"loss": 2.4536, |
|
"theoretical_loss": 3.3199002083538613, |
|
"tokens_seen": 3010134016 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004728400342172797, |
|
"loss": 2.6436, |
|
"theoretical_loss": 3.319894654854208, |
|
"tokens_seen": 3010199552 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004727331052181352, |
|
"loss": 2.3205, |
|
"theoretical_loss": 3.319889101509313, |
|
"tokens_seen": 3010265088 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004726261762189906, |
|
"loss": 2.6071, |
|
"theoretical_loss": 3.3198835483191695, |
|
"tokens_seen": 3010330624 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.000472519247219846, |
|
"loss": 2.6072, |
|
"theoretical_loss": 3.319877995283769, |
|
"tokens_seen": 3010396160 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004724123182207015, |
|
"loss": 2.5943, |
|
"theoretical_loss": 3.319872442403105, |
|
"tokens_seen": 3010461696 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 0.0004723053892215569, |
|
"loss": 2.4161, |
|
"theoretical_loss": 3.3198668896771686, |
|
"tokens_seen": 3010527232 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004721984602224123, |
|
"loss": 2.5087, |
|
"theoretical_loss": 3.3198613371059524, |
|
"tokens_seen": 3010592768 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047209153122326774, |
|
"loss": 2.5444, |
|
"theoretical_loss": 3.319855784689449, |
|
"tokens_seen": 3010658304 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004719846022241232, |
|
"loss": 2.6907, |
|
"theoretical_loss": 3.319850232427651, |
|
"tokens_seen": 3010723840 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004718776732249786, |
|
"loss": 2.4364, |
|
"theoretical_loss": 3.31984468032055, |
|
"tokens_seen": 3010789376 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047177074422583405, |
|
"loss": 2.3893, |
|
"theoretical_loss": 3.3198391283681383, |
|
"tokens_seen": 3010854912 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047166381522668947, |
|
"loss": 2.5048, |
|
"theoretical_loss": 3.3198335765704092, |
|
"tokens_seen": 3010920448 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047155688622754495, |
|
"loss": 2.6298, |
|
"theoretical_loss": 3.3198280249273546, |
|
"tokens_seen": 3010985984 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047144995722840036, |
|
"loss": 2.8107, |
|
"theoretical_loss": 3.319822473438966, |
|
"tokens_seen": 3011051520 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004713430282292558, |
|
"loss": 2.503, |
|
"theoretical_loss": 3.319816922105237, |
|
"tokens_seen": 3011117056 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047123609923011126, |
|
"loss": 2.5555, |
|
"theoretical_loss": 3.3198113709261587, |
|
"tokens_seen": 3011182592 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004711291702309666, |
|
"loss": 2.4985, |
|
"theoretical_loss": 3.319805819901724, |
|
"tokens_seen": 3011248128 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004710222412318221, |
|
"loss": 2.4865, |
|
"theoretical_loss": 3.3198002690319255, |
|
"tokens_seen": 3011313664 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004709153122326775, |
|
"loss": 2.5541, |
|
"theoretical_loss": 3.3197947183167553, |
|
"tokens_seen": 3011379200 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047080838323353293, |
|
"loss": 2.4878, |
|
"theoretical_loss": 3.319789167756206, |
|
"tokens_seen": 3011444736 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047070145423438835, |
|
"loss": 2.4241, |
|
"theoretical_loss": 3.319783617350269, |
|
"tokens_seen": 3011510272 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004705945252352438, |
|
"loss": 2.6346, |
|
"theoretical_loss": 3.3197780670989374, |
|
"tokens_seen": 3011575808 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047048759623609924, |
|
"loss": 2.3981, |
|
"theoretical_loss": 3.319772517002204, |
|
"tokens_seen": 3011641344 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"objective/train/docs_used": 1651868, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.2719619274139404, |
|
"objective/train/theoretical_loss": 3.31976696706006, |
|
"objective/train/tokens_used": 41758176, |
|
"theoretical_loss": 3.31976696706006, |
|
"tokens_seen": 3011706880 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047038066723695466, |
|
"loss": 2.4118, |
|
"theoretical_loss": 3.31976696706006, |
|
"tokens_seen": 3011706880 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047027373823781013, |
|
"loss": 2.6711, |
|
"theoretical_loss": 3.319761417272498, |
|
"tokens_seen": 3011772416 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047016680923866555, |
|
"loss": 2.6537, |
|
"theoretical_loss": 3.319755867639511, |
|
"tokens_seen": 3011837952 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00047005988023952097, |
|
"loss": 2.6158, |
|
"theoretical_loss": 3.319750318161091, |
|
"tokens_seen": 3011903488 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004699529512403764, |
|
"loss": 2.2521, |
|
"theoretical_loss": 3.3197447688372295, |
|
"tokens_seen": 3011969024 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046984602224123186, |
|
"loss": 2.6413, |
|
"theoretical_loss": 3.3197392196679205, |
|
"tokens_seen": 3012034560 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004697390932420872, |
|
"loss": 2.7066, |
|
"theoretical_loss": 3.3197336706531546, |
|
"tokens_seen": 3012100096 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004696321642429427, |
|
"loss": 2.4956, |
|
"theoretical_loss": 3.3197281217929255, |
|
"tokens_seen": 3012165632 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046952523524379817, |
|
"loss": 2.7685, |
|
"theoretical_loss": 3.319722573087225, |
|
"tokens_seen": 3012231168 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046941830624465353, |
|
"loss": 2.6271, |
|
"theoretical_loss": 3.319717024536045, |
|
"tokens_seen": 3012296704 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.000469311377245509, |
|
"loss": 2.6398, |
|
"theoretical_loss": 3.3197114761393784, |
|
"tokens_seen": 3012362240 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004692044482463644, |
|
"loss": 2.5921, |
|
"theoretical_loss": 3.3197059278972176, |
|
"tokens_seen": 3012427776 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046909751924721984, |
|
"loss": 2.6227, |
|
"theoretical_loss": 3.3197003798095546, |
|
"tokens_seen": 3012493312 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046899059024807526, |
|
"loss": 2.6272, |
|
"theoretical_loss": 3.3196948318763817, |
|
"tokens_seen": 3012558848 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046888366124893074, |
|
"loss": 2.4564, |
|
"theoretical_loss": 3.3196892840976915, |
|
"tokens_seen": 3012624384 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046877673224978616, |
|
"loss": 2.5827, |
|
"theoretical_loss": 3.319683736473476, |
|
"tokens_seen": 3012689920 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004686698032506416, |
|
"loss": 2.5221, |
|
"theoretical_loss": 3.319678189003728, |
|
"tokens_seen": 3012755456 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.000468562874251497, |
|
"loss": 2.2411, |
|
"theoretical_loss": 3.3196726416884395, |
|
"tokens_seen": 3012820992 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046845594525235247, |
|
"loss": 2.5399, |
|
"theoretical_loss": 3.319667094527603, |
|
"tokens_seen": 3012886528 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004683490162532079, |
|
"loss": 2.5599, |
|
"theoretical_loss": 3.3196615475212106, |
|
"tokens_seen": 3012952064 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004682420872540633, |
|
"loss": 2.6186, |
|
"theoretical_loss": 3.319656000669255, |
|
"tokens_seen": 3013017600 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004681351582549188, |
|
"loss": 2.4364, |
|
"theoretical_loss": 3.3196504539717284, |
|
"tokens_seen": 3013083136 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046802822925577414, |
|
"loss": 2.4645, |
|
"theoretical_loss": 3.319644907428623, |
|
"tokens_seen": 3013148672 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004679213002566296, |
|
"loss": 2.3282, |
|
"theoretical_loss": 3.3196393610399317, |
|
"tokens_seen": 3013214208 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046781437125748503, |
|
"loss": 2.3968, |
|
"theoretical_loss": 3.3196338148056457, |
|
"tokens_seen": 3013279744 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"objective/train/docs_used": 1652881, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 3.02851939201355, |
|
"objective/train/theoretical_loss": 3.3196282687257583, |
|
"objective/train/tokens_used": 43396576, |
|
"theoretical_loss": 3.3196282687257583, |
|
"tokens_seen": 3013345280 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004677074422583405, |
|
"loss": 2.73, |
|
"theoretical_loss": 3.3196282687257583, |
|
"tokens_seen": 3013345280 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046760051325919587, |
|
"loss": 2.7081, |
|
"theoretical_loss": 3.3196227228002617, |
|
"tokens_seen": 3013410816 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.00046749358426005134, |
|
"loss": 2.4167, |
|
"theoretical_loss": 3.3196171770291483, |
|
"tokens_seen": 3013476352 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004673866552609068, |
|
"loss": 2.6501, |
|
"theoretical_loss": 3.31961163141241, |
|
"tokens_seen": 3013541888 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"learning_rate": 0.0004672797262617622, |
|
"loss": 2.711, |
|
"theoretical_loss": 3.3196060859500394, |
|
"tokens_seen": 3013607424 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046717279726261765, |
|
"loss": 2.6538, |
|
"theoretical_loss": 3.319600540642029, |
|
"tokens_seen": 3013672960 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046706586826347307, |
|
"loss": 2.7554, |
|
"theoretical_loss": 3.319594995488371, |
|
"tokens_seen": 3013738496 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004669589392643285, |
|
"loss": 2.6815, |
|
"theoretical_loss": 3.3195894504890573, |
|
"tokens_seen": 3013804032 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004668520102651839, |
|
"loss": 2.4327, |
|
"theoretical_loss": 3.3195839056440812, |
|
"tokens_seen": 3013869568 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004667450812660394, |
|
"loss": 2.6057, |
|
"theoretical_loss": 3.3195783609534346, |
|
"tokens_seen": 3013935104 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046663815226689474, |
|
"loss": 2.8111, |
|
"theoretical_loss": 3.3195728164171094, |
|
"tokens_seen": 3014000640 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004665312232677502, |
|
"loss": 2.5688, |
|
"theoretical_loss": 3.3195672720350986, |
|
"tokens_seen": 3014066176 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004664242942686057, |
|
"loss": 2.4503, |
|
"theoretical_loss": 3.319561727807394, |
|
"tokens_seen": 3014131712 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004663173652694611, |
|
"loss": 2.6877, |
|
"theoretical_loss": 3.319556183733989, |
|
"tokens_seen": 3014197248 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046621043627031653, |
|
"loss": 2.6751, |
|
"theoretical_loss": 3.3195506398148744, |
|
"tokens_seen": 3014262784 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046610350727117195, |
|
"loss": 2.4173, |
|
"theoretical_loss": 3.3195450960500437, |
|
"tokens_seen": 3014328320 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004659965782720274, |
|
"loss": 2.6038, |
|
"theoretical_loss": 3.319539552439489, |
|
"tokens_seen": 3014393856 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004658896492728828, |
|
"loss": 2.638, |
|
"theoretical_loss": 3.3195340089832026, |
|
"tokens_seen": 3014459392 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046578272027373826, |
|
"loss": 2.5091, |
|
"theoretical_loss": 3.3195284656811763, |
|
"tokens_seen": 3014524928 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004656757912745937, |
|
"loss": 2.807, |
|
"theoretical_loss": 3.3195229225334035, |
|
"tokens_seen": 3014590464 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004655688622754491, |
|
"loss": 2.3861, |
|
"theoretical_loss": 3.319517379539876, |
|
"tokens_seen": 3014656000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004654619332763045, |
|
"loss": 2.8883, |
|
"theoretical_loss": 3.3195118367005856, |
|
"tokens_seen": 3014721536 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046535500427716, |
|
"loss": 2.6202, |
|
"theoretical_loss": 3.3195062940155258, |
|
"tokens_seen": 3014787072 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004652480752780154, |
|
"loss": 2.6868, |
|
"theoretical_loss": 3.3195007514846884, |
|
"tokens_seen": 3014852608 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004651411462788708, |
|
"loss": 2.4873, |
|
"theoretical_loss": 3.3194952091080654, |
|
"tokens_seen": 3014918144 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"objective/train/docs_used": 1653310, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 1.8691486120224, |
|
"objective/train/theoretical_loss": 3.3194896668856497, |
|
"objective/train/tokens_used": 45034976, |
|
"theoretical_loss": 3.3194896668856497, |
|
"tokens_seen": 3014983680 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004650342172797263, |
|
"loss": 2.2912, |
|
"theoretical_loss": 3.3194896668856497, |
|
"tokens_seen": 3014983680 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004649272882805817, |
|
"loss": 2.609, |
|
"theoretical_loss": 3.3194841248174334, |
|
"tokens_seen": 3015049216 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046482035928143713, |
|
"loss": 2.77, |
|
"theoretical_loss": 3.319478582903409, |
|
"tokens_seen": 3015114752 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046471343028229255, |
|
"loss": 2.8639, |
|
"theoretical_loss": 3.3194730411435684, |
|
"tokens_seen": 3015180288 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.000464606501283148, |
|
"loss": 2.601, |
|
"theoretical_loss": 3.3194674995379048, |
|
"tokens_seen": 3015245824 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004644995722840034, |
|
"loss": 2.2598, |
|
"theoretical_loss": 3.3194619580864098, |
|
"tokens_seen": 3015311360 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046439264328485886, |
|
"loss": 2.6682, |
|
"theoretical_loss": 3.3194564167890763, |
|
"tokens_seen": 3015376896 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046428571428571433, |
|
"loss": 2.5971, |
|
"theoretical_loss": 3.3194508756458965, |
|
"tokens_seen": 3015442432 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004641787852865697, |
|
"loss": 2.5004, |
|
"theoretical_loss": 3.319445334656862, |
|
"tokens_seen": 3015507968 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046407185628742517, |
|
"loss": 2.7767, |
|
"theoretical_loss": 3.319439793821967, |
|
"tokens_seen": 3015573504 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004639649272882806, |
|
"loss": 2.6512, |
|
"theoretical_loss": 3.3194342531412016, |
|
"tokens_seen": 3015639040 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.000463857998289136, |
|
"loss": 2.6137, |
|
"theoretical_loss": 3.3194287126145596, |
|
"tokens_seen": 3015704576 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046375106928999143, |
|
"loss": 2.4597, |
|
"theoretical_loss": 3.319423172242033, |
|
"tokens_seen": 3015770112 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004636441402908469, |
|
"loss": 2.5436, |
|
"theoretical_loss": 3.3194176320236144, |
|
"tokens_seen": 3015835648 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004635372112917023, |
|
"loss": 2.5871, |
|
"theoretical_loss": 3.319412091959296, |
|
"tokens_seen": 3015901184 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046343028229255774, |
|
"loss": 2.709, |
|
"theoretical_loss": 3.31940655204907, |
|
"tokens_seen": 3015966720 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004633233532934132, |
|
"loss": 2.7242, |
|
"theoretical_loss": 3.319401012292929, |
|
"tokens_seen": 3016032256 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046321642429426863, |
|
"loss": 2.4904, |
|
"theoretical_loss": 3.319395472690865, |
|
"tokens_seen": 3016097792 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046310949529512405, |
|
"loss": 2.4424, |
|
"theoretical_loss": 3.3193899332428707, |
|
"tokens_seen": 3016163328 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046300256629597947, |
|
"loss": 2.5032, |
|
"theoretical_loss": 3.3193843939489382, |
|
"tokens_seen": 3016228864 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046289563729683494, |
|
"loss": 2.702, |
|
"theoretical_loss": 3.3193788548090604, |
|
"tokens_seen": 3016294400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004627887082976903, |
|
"loss": 2.622, |
|
"theoretical_loss": 3.319373315823229, |
|
"tokens_seen": 3016359936 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004626817792985458, |
|
"loss": 2.7673, |
|
"theoretical_loss": 3.319367776991437, |
|
"tokens_seen": 3016425472 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004625748502994012, |
|
"loss": 2.4798, |
|
"theoretical_loss": 3.3193622383136763, |
|
"tokens_seen": 3016491008 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.00046246792130025667, |
|
"loss": 2.919, |
|
"theoretical_loss": 3.3193566997899397, |
|
"tokens_seen": 3016556544 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"objective/train/docs_used": 1654610, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.208578109741211, |
|
"objective/train/theoretical_loss": 3.3193511614202187, |
|
"objective/train/tokens_used": 46673376, |
|
"theoretical_loss": 3.3193511614202187, |
|
"tokens_seen": 3016622080 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004623609923011121, |
|
"loss": 2.5259, |
|
"theoretical_loss": 3.3193511614202187, |
|
"tokens_seen": 3016622080 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 0.0004622540633019675, |
|
"loss": 2.5413, |
|
"theoretical_loss": 3.319345623204507, |
|
"tokens_seen": 3016687616 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.000462147134302823, |
|
"loss": 2.7374, |
|
"theoretical_loss": 3.319340085142796, |
|
"tokens_seen": 3016753152 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046204020530367834, |
|
"loss": 2.4026, |
|
"theoretical_loss": 3.319334547235078, |
|
"tokens_seen": 3016818688 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004619332763045338, |
|
"loss": 2.5737, |
|
"theoretical_loss": 3.319329009481346, |
|
"tokens_seen": 3016884224 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046182634730538923, |
|
"loss": 2.4429, |
|
"theoretical_loss": 3.319323471881592, |
|
"tokens_seen": 3016949760 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046171941830624465, |
|
"loss": 2.6043, |
|
"theoretical_loss": 3.3193179344358086, |
|
"tokens_seen": 3017015296 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046161248930710007, |
|
"loss": 2.5731, |
|
"theoretical_loss": 3.3193123971439875, |
|
"tokens_seen": 3017080832 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046150556030795554, |
|
"loss": 2.6597, |
|
"theoretical_loss": 3.319306860006122, |
|
"tokens_seen": 3017146368 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004613986313088109, |
|
"loss": 2.5896, |
|
"theoretical_loss": 3.319301323022204, |
|
"tokens_seen": 3017211904 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004612917023096664, |
|
"loss": 2.7416, |
|
"theoretical_loss": 3.319295786192226, |
|
"tokens_seen": 3017277440 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046118477331052185, |
|
"loss": 2.763, |
|
"theoretical_loss": 3.3192902495161802, |
|
"tokens_seen": 3017342976 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046107784431137727, |
|
"loss": 2.6935, |
|
"theoretical_loss": 3.319284712994059, |
|
"tokens_seen": 3017408512 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004609709153122327, |
|
"loss": 2.6153, |
|
"theoretical_loss": 3.319279176625855, |
|
"tokens_seen": 3017474048 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004608639863130881, |
|
"loss": 2.5454, |
|
"theoretical_loss": 3.3192736404115606, |
|
"tokens_seen": 3017539584 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004607570573139436, |
|
"loss": 2.564, |
|
"theoretical_loss": 3.319268104351168, |
|
"tokens_seen": 3017605120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046065012831479895, |
|
"loss": 2.4576, |
|
"theoretical_loss": 3.3192625684446693, |
|
"tokens_seen": 3017670656 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004605431993156544, |
|
"loss": 2.8301, |
|
"theoretical_loss": 3.3192570326920574, |
|
"tokens_seen": 3017736192 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046043627031650984, |
|
"loss": 2.5667, |
|
"theoretical_loss": 3.3192514970933242, |
|
"tokens_seen": 3017801728 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046032934131736526, |
|
"loss": 2.2958, |
|
"theoretical_loss": 3.3192459616484626, |
|
"tokens_seen": 3017867264 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046022241231822073, |
|
"loss": 2.5172, |
|
"theoretical_loss": 3.319240426357465, |
|
"tokens_seen": 3017932800 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046011548331907615, |
|
"loss": 2.6184, |
|
"theoretical_loss": 3.319234891220323, |
|
"tokens_seen": 3017998336 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00046000855431993157, |
|
"loss": 2.6125, |
|
"theoretical_loss": 3.31922935623703, |
|
"tokens_seen": 3018063872 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.000459901625320787, |
|
"loss": 2.5618, |
|
"theoretical_loss": 3.3192238214075775, |
|
"tokens_seen": 3018129408 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045979469632164246, |
|
"loss": 2.6876, |
|
"theoretical_loss": 3.3192182867319584, |
|
"tokens_seen": 3018194944 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"objective/train/docs_used": 1655301, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.565507411956787, |
|
"objective/train/theoretical_loss": 3.319212752210165, |
|
"objective/train/tokens_used": 48311776, |
|
"theoretical_loss": 3.319212752210165, |
|
"tokens_seen": 3018260480 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004596877673224979, |
|
"loss": 2.4072, |
|
"theoretical_loss": 3.319212752210165, |
|
"tokens_seen": 3018260480 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004595808383233533, |
|
"loss": 2.565, |
|
"theoretical_loss": 3.3192072178421896, |
|
"tokens_seen": 3018326016 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004594739093242087, |
|
"loss": 2.7793, |
|
"theoretical_loss": 3.3192016836280245, |
|
"tokens_seen": 3018391552 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004593669803250642, |
|
"loss": 2.587, |
|
"theoretical_loss": 3.319196149567662, |
|
"tokens_seen": 3018457088 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004592600513259196, |
|
"loss": 2.6595, |
|
"theoretical_loss": 3.3191906156610953, |
|
"tokens_seen": 3018522624 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.000459153122326775, |
|
"loss": 2.6921, |
|
"theoretical_loss": 3.3191850819083157, |
|
"tokens_seen": 3018588160 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004590461933276305, |
|
"loss": 2.5763, |
|
"theoretical_loss": 3.3191795483093163, |
|
"tokens_seen": 3018653696 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045893926432848586, |
|
"loss": 2.5511, |
|
"theoretical_loss": 3.319174014864089, |
|
"tokens_seen": 3018719232 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045883233532934134, |
|
"loss": 2.4876, |
|
"theoretical_loss": 3.3191684815726266, |
|
"tokens_seen": 3018784768 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045872540633019675, |
|
"loss": 2.5574, |
|
"theoretical_loss": 3.319162948434921, |
|
"tokens_seen": 3018850304 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045861847733105217, |
|
"loss": 2.6569, |
|
"theoretical_loss": 3.3191574154509658, |
|
"tokens_seen": 3018915840 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004585115483319076, |
|
"loss": 2.6353, |
|
"theoretical_loss": 3.319151882620752, |
|
"tokens_seen": 3018981376 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045840461933276306, |
|
"loss": 2.6696, |
|
"theoretical_loss": 3.319146349944272, |
|
"tokens_seen": 3019046912 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045829769033361854, |
|
"loss": 2.5971, |
|
"theoretical_loss": 3.3191408174215193, |
|
"tokens_seen": 3019112448 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004581907613344739, |
|
"loss": 2.6662, |
|
"theoretical_loss": 3.319135285052486, |
|
"tokens_seen": 3019177984 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004580838323353294, |
|
"loss": 2.6373, |
|
"theoretical_loss": 3.3191297528371635, |
|
"tokens_seen": 3019243520 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004579769033361848, |
|
"loss": 2.4068, |
|
"theoretical_loss": 3.319124220775545, |
|
"tokens_seen": 3019309056 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004578699743370402, |
|
"loss": 2.5871, |
|
"theoretical_loss": 3.319118688867623, |
|
"tokens_seen": 3019374592 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045776304533789563, |
|
"loss": 2.5294, |
|
"theoretical_loss": 3.319113157113389, |
|
"tokens_seen": 3019440128 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.0004576561163387511, |
|
"loss": 2.4393, |
|
"theoretical_loss": 3.319107625512837, |
|
"tokens_seen": 3019505664 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045754918733960647, |
|
"loss": 2.6992, |
|
"theoretical_loss": 3.3191020940659577, |
|
"tokens_seen": 3019571200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045744225834046194, |
|
"loss": 2.5488, |
|
"theoretical_loss": 3.3190965627727445, |
|
"tokens_seen": 3019636736 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045733532934131736, |
|
"loss": 2.5638, |
|
"theoretical_loss": 3.3190910316331896, |
|
"tokens_seen": 3019702272 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 0.00045722840034217283, |
|
"loss": 2.6045, |
|
"theoretical_loss": 3.3190855006472857, |
|
"tokens_seen": 3019767808 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045712147134302825, |
|
"loss": 2.5425, |
|
"theoretical_loss": 3.319079969815024, |
|
"tokens_seen": 3019833344 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"objective/train/docs_used": 1656638, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.6638667583465576, |
|
"objective/train/theoretical_loss": 3.3190744391363984, |
|
"objective/train/tokens_used": 49950176, |
|
"theoretical_loss": 3.3190744391363984, |
|
"tokens_seen": 3019898880 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045701454234388367, |
|
"loss": 2.6716, |
|
"theoretical_loss": 3.3190744391363984, |
|
"tokens_seen": 3019898880 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045690761334473914, |
|
"loss": 2.6018, |
|
"theoretical_loss": 3.3190689086114005, |
|
"tokens_seen": 3019964416 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004568006843455945, |
|
"loss": 2.4702, |
|
"theoretical_loss": 3.3190633782400223, |
|
"tokens_seen": 3020029952 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045669375534645, |
|
"loss": 2.5531, |
|
"theoretical_loss": 3.3190578480222577, |
|
"tokens_seen": 3020095488 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004565868263473054, |
|
"loss": 2.5263, |
|
"theoretical_loss": 3.3190523179580973, |
|
"tokens_seen": 3020161024 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004564798973481608, |
|
"loss": 2.6229, |
|
"theoretical_loss": 3.3190467880475345, |
|
"tokens_seen": 3020226560 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045637296834901623, |
|
"loss": 2.479, |
|
"theoretical_loss": 3.3190412582905617, |
|
"tokens_seen": 3020292096 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004562660393498717, |
|
"loss": 2.6546, |
|
"theoretical_loss": 3.319035728687171, |
|
"tokens_seen": 3020357632 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004561591103507271, |
|
"loss": 2.5858, |
|
"theoretical_loss": 3.319030199237355, |
|
"tokens_seen": 3020423168 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045605218135158254, |
|
"loss": 2.3675, |
|
"theoretical_loss": 3.319024669941106, |
|
"tokens_seen": 3020488704 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.000455945252352438, |
|
"loss": 2.5846, |
|
"theoretical_loss": 3.3190191407984164, |
|
"tokens_seen": 3020554240 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045583832335329344, |
|
"loss": 2.4535, |
|
"theoretical_loss": 3.319013611809279, |
|
"tokens_seen": 3020619776 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045573139435414885, |
|
"loss": 2.7591, |
|
"theoretical_loss": 3.3190080829736854, |
|
"tokens_seen": 3020685312 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004556244653550043, |
|
"loss": 2.5729, |
|
"theoretical_loss": 3.319002554291629, |
|
"tokens_seen": 3020750848 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045551753635585975, |
|
"loss": 2.5332, |
|
"theoretical_loss": 3.318997025763101, |
|
"tokens_seen": 3020816384 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004554106073567151, |
|
"loss": 2.5638, |
|
"theoretical_loss": 3.3189914973880947, |
|
"tokens_seen": 3020881920 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004553036783575706, |
|
"loss": 2.476, |
|
"theoretical_loss": 3.318985969166602, |
|
"tokens_seen": 3020947456 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045519674935842606, |
|
"loss": 2.7148, |
|
"theoretical_loss": 3.318980441098616, |
|
"tokens_seen": 3021012992 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004550898203592814, |
|
"loss": 2.5545, |
|
"theoretical_loss": 3.3189749131841286, |
|
"tokens_seen": 3021078528 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004549828913601369, |
|
"loss": 2.5361, |
|
"theoretical_loss": 3.3189693854231326, |
|
"tokens_seen": 3021144064 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004548759623609923, |
|
"loss": 2.7275, |
|
"theoretical_loss": 3.3189638578156195, |
|
"tokens_seen": 3021209600 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045476903336184773, |
|
"loss": 2.7548, |
|
"theoretical_loss": 3.3189583303615824, |
|
"tokens_seen": 3021275136 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045466210436270315, |
|
"loss": 2.6902, |
|
"theoretical_loss": 3.3189528030610136, |
|
"tokens_seen": 3021340672 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004545551753635586, |
|
"loss": 2.3308, |
|
"theoretical_loss": 3.318947275913906, |
|
"tokens_seen": 3021406208 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045444824636441404, |
|
"loss": 2.437, |
|
"theoretical_loss": 3.318941748920251, |
|
"tokens_seen": 3021471744 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"objective/train/docs_used": 1657764, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.8103880882263184, |
|
"objective/train/theoretical_loss": 3.318936222080042, |
|
"objective/train/tokens_used": 51588576, |
|
"theoretical_loss": 3.318936222080042, |
|
"tokens_seen": 3021537280 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045434131736526946, |
|
"loss": 2.5186, |
|
"theoretical_loss": 3.318936222080042, |
|
"tokens_seen": 3021537280 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045423438836612493, |
|
"loss": 2.619, |
|
"theoretical_loss": 3.318930695393271, |
|
"tokens_seen": 3021602816 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045412745936698035, |
|
"loss": 2.6483, |
|
"theoretical_loss": 3.3189251688599297, |
|
"tokens_seen": 3021668352 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045402053036783577, |
|
"loss": 2.446, |
|
"theoretical_loss": 3.3189196424800116, |
|
"tokens_seen": 3021733888 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004539136013686912, |
|
"loss": 2.2355, |
|
"theoretical_loss": 3.318914116253509, |
|
"tokens_seen": 3021799424 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045380667236954666, |
|
"loss": 2.4651, |
|
"theoretical_loss": 3.3189085901804134, |
|
"tokens_seen": 3021864960 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.000453699743370402, |
|
"loss": 2.3771, |
|
"theoretical_loss": 3.3189030642607182, |
|
"tokens_seen": 3021930496 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004535928143712575, |
|
"loss": 2.2859, |
|
"theoretical_loss": 3.3188975384944155, |
|
"tokens_seen": 3021996032 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004534858853721129, |
|
"loss": 2.4815, |
|
"theoretical_loss": 3.3188920128814976, |
|
"tokens_seen": 3022061568 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045337895637296834, |
|
"loss": 2.4833, |
|
"theoretical_loss": 3.318886487421957, |
|
"tokens_seen": 3022127104 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045327202737382375, |
|
"loss": 2.6793, |
|
"theoretical_loss": 3.3188809621157858, |
|
"tokens_seen": 3022192640 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045316509837467923, |
|
"loss": 2.4563, |
|
"theoretical_loss": 3.318875436962977, |
|
"tokens_seen": 3022258176 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004530581693755347, |
|
"loss": 2.5047, |
|
"theoretical_loss": 3.3188699119635228, |
|
"tokens_seen": 3022323712 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045295124037639006, |
|
"loss": 2.7077, |
|
"theoretical_loss": 3.3188643871174155, |
|
"tokens_seen": 3022389248 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045284431137724554, |
|
"loss": 2.5694, |
|
"theoretical_loss": 3.3188588624246473, |
|
"tokens_seen": 3022454784 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045273738237810096, |
|
"loss": 2.4641, |
|
"theoretical_loss": 3.318853337885211, |
|
"tokens_seen": 3022520320 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004526304533789564, |
|
"loss": 2.5008, |
|
"theoretical_loss": 3.318847813499099, |
|
"tokens_seen": 3022585856 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004525235243798118, |
|
"loss": 2.5933, |
|
"theoretical_loss": 3.318842289266304, |
|
"tokens_seen": 3022651392 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045241659538066727, |
|
"loss": 2.7056, |
|
"theoretical_loss": 3.318836765186817, |
|
"tokens_seen": 3022716928 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.00045230966638152263, |
|
"loss": 2.5471, |
|
"theoretical_loss": 3.3188312412606327, |
|
"tokens_seen": 3022782464 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004522027373823781, |
|
"loss": 2.6295, |
|
"theoretical_loss": 3.3188257174877416, |
|
"tokens_seen": 3022848000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 0.0004520958083832336, |
|
"loss": 2.5878, |
|
"theoretical_loss": 3.3188201938681368, |
|
"tokens_seen": 3022913536 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00045198887938408894, |
|
"loss": 2.7199, |
|
"theoretical_loss": 3.3188146704018107, |
|
"tokens_seen": 3022979072 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004518819503849444, |
|
"loss": 2.441, |
|
"theoretical_loss": 3.318809147088756, |
|
"tokens_seen": 3023044608 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00045177502138579983, |
|
"loss": 2.5828, |
|
"theoretical_loss": 3.318803623928965, |
|
"tokens_seen": 3023110144 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"objective/train/docs_used": 1658338, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.4892239570617676, |
|
"objective/train/theoretical_loss": 3.3187981009224297, |
|
"objective/train/tokens_used": 53226976, |
|
"theoretical_loss": 3.3187981009224297, |
|
"tokens_seen": 3023175680 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004516680923866553, |
|
"loss": 2.4785, |
|
"theoretical_loss": 3.3187981009224297, |
|
"tokens_seen": 3023175680 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00045156116338751067, |
|
"loss": 2.4167, |
|
"theoretical_loss": 3.318792578069143, |
|
"tokens_seen": 3023241216 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00045145423438836614, |
|
"loss": 2.4702, |
|
"theoretical_loss": 3.3187870553690972, |
|
"tokens_seen": 3023306752 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00045134730538922156, |
|
"loss": 2.3209, |
|
"theoretical_loss": 3.3187815328222845, |
|
"tokens_seen": 3023372288 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.000451240376390077, |
|
"loss": 2.5124, |
|
"theoretical_loss": 3.3187760104286976, |
|
"tokens_seen": 3023437824 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00045113344739093245, |
|
"loss": 2.5166, |
|
"theoretical_loss": 3.3187704881883286, |
|
"tokens_seen": 3023503360 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00045102651839178787, |
|
"loss": 2.6266, |
|
"theoretical_loss": 3.3187649661011704, |
|
"tokens_seen": 3023568896 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004509195893926433, |
|
"loss": 2.6106, |
|
"theoretical_loss": 3.3187594441672155, |
|
"tokens_seen": 3023634432 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004508126603934987, |
|
"loss": 2.768, |
|
"theoretical_loss": 3.3187539223864557, |
|
"tokens_seen": 3023699968 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004507057313943542, |
|
"loss": 2.5288, |
|
"theoretical_loss": 3.3187484007588837, |
|
"tokens_seen": 3023765504 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004505988023952096, |
|
"loss": 2.4277, |
|
"theoretical_loss": 3.318742879284492, |
|
"tokens_seen": 3023831040 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.000450491873396065, |
|
"loss": 2.7191, |
|
"theoretical_loss": 3.3187373579632733, |
|
"tokens_seen": 3023896576 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00045038494439692044, |
|
"loss": 2.5977, |
|
"theoretical_loss": 3.3187318367952194, |
|
"tokens_seen": 3023962112 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004502780153977759, |
|
"loss": 2.5787, |
|
"theoretical_loss": 3.3187263157803235, |
|
"tokens_seen": 3024027648 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00045017108639863133, |
|
"loss": 2.7877, |
|
"theoretical_loss": 3.318720794918577, |
|
"tokens_seen": 3024093184 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00045006415739948675, |
|
"loss": 2.6564, |
|
"theoretical_loss": 3.3187152742099735, |
|
"tokens_seen": 3024158720 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004499572284003422, |
|
"loss": 2.7045, |
|
"theoretical_loss": 3.3187097536545047, |
|
"tokens_seen": 3024224256 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004498502994011976, |
|
"loss": 2.6133, |
|
"theoretical_loss": 3.3187042332521632, |
|
"tokens_seen": 3024289792 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044974337040205306, |
|
"loss": 2.4414, |
|
"theoretical_loss": 3.3186987130029415, |
|
"tokens_seen": 3024355328 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004496364414029085, |
|
"loss": 2.6137, |
|
"theoretical_loss": 3.318693192906832, |
|
"tokens_seen": 3024420864 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004495295124037639, |
|
"loss": 2.5737, |
|
"theoretical_loss": 3.3186876729638266, |
|
"tokens_seen": 3024486400 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004494225834046193, |
|
"loss": 2.6552, |
|
"theoretical_loss": 3.318682153173919, |
|
"tokens_seen": 3024551936 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004493156544054748, |
|
"loss": 2.5312, |
|
"theoretical_loss": 3.3186766335371005, |
|
"tokens_seen": 3024617472 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044920872540633026, |
|
"loss": 2.5598, |
|
"theoretical_loss": 3.3186711140533642, |
|
"tokens_seen": 3024683008 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004491017964071856, |
|
"loss": 2.6287, |
|
"theoretical_loss": 3.318665594722702, |
|
"tokens_seen": 3024748544 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"objective/train/docs_used": 1659421, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.7263076305389404, |
|
"objective/train/theoretical_loss": 3.3186600755451066, |
|
"objective/train/tokens_used": 54865376, |
|
"theoretical_loss": 3.3186600755451066, |
|
"tokens_seen": 3024814080 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004489948674080411, |
|
"loss": 2.5863, |
|
"theoretical_loss": 3.3186600755451066, |
|
"tokens_seen": 3024814080 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004488879384088965, |
|
"loss": 2.4847, |
|
"theoretical_loss": 3.3186545565205705, |
|
"tokens_seen": 3024879616 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044878100940975193, |
|
"loss": 2.6246, |
|
"theoretical_loss": 3.3186490376490863, |
|
"tokens_seen": 3024945152 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044867408041060735, |
|
"loss": 2.4603, |
|
"theoretical_loss": 3.318643518930646, |
|
"tokens_seen": 3025010688 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004485671514114628, |
|
"loss": 2.5178, |
|
"theoretical_loss": 3.318638000365242, |
|
"tokens_seen": 3025076224 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004484602224123182, |
|
"loss": 2.5561, |
|
"theoretical_loss": 3.3186324819528674, |
|
"tokens_seen": 3025141760 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044835329341317366, |
|
"loss": 2.5152, |
|
"theoretical_loss": 3.318626963693514, |
|
"tokens_seen": 3025207296 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004482463644140291, |
|
"loss": 2.4108, |
|
"theoretical_loss": 3.318621445587175, |
|
"tokens_seen": 3025272832 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004481394354148845, |
|
"loss": 2.7335, |
|
"theoretical_loss": 3.3186159276338416, |
|
"tokens_seen": 3025338368 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044803250641573997, |
|
"loss": 2.4303, |
|
"theoretical_loss": 3.3186104098335076, |
|
"tokens_seen": 3025403904 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004479255774165954, |
|
"loss": 2.3138, |
|
"theoretical_loss": 3.3186048921861646, |
|
"tokens_seen": 3025469440 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044781864841745086, |
|
"loss": 2.4013, |
|
"theoretical_loss": 3.318599374691805, |
|
"tokens_seen": 3025534976 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044771171941830623, |
|
"loss": 2.3985, |
|
"theoretical_loss": 3.3185938573504217, |
|
"tokens_seen": 3025600512 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004476047904191617, |
|
"loss": 2.6061, |
|
"theoretical_loss": 3.318588340162007, |
|
"tokens_seen": 3025666048 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.0004474978614200171, |
|
"loss": 2.659, |
|
"theoretical_loss": 3.3185828231265533, |
|
"tokens_seen": 3025731584 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044739093242087254, |
|
"loss": 2.2818, |
|
"theoretical_loss": 3.3185773062440527, |
|
"tokens_seen": 3025797120 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044728400342172796, |
|
"loss": 2.4729, |
|
"theoretical_loss": 3.3185717895144986, |
|
"tokens_seen": 3025862656 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044717707442258343, |
|
"loss": 2.6619, |
|
"theoretical_loss": 3.3185662729378826, |
|
"tokens_seen": 3025928192 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 0.00044707014542343885, |
|
"loss": 2.5107, |
|
"theoretical_loss": 3.3185607565141972, |
|
"tokens_seen": 3025993728 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044696321642429427, |
|
"loss": 2.3266, |
|
"theoretical_loss": 3.318555240243435, |
|
"tokens_seen": 3026059264 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044685628742514974, |
|
"loss": 2.6211, |
|
"theoretical_loss": 3.318549724125589, |
|
"tokens_seen": 3026124800 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004467493584260051, |
|
"loss": 2.3811, |
|
"theoretical_loss": 3.3185442081606507, |
|
"tokens_seen": 3026190336 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004466424294268606, |
|
"loss": 2.5501, |
|
"theoretical_loss": 3.318538692348613, |
|
"tokens_seen": 3026255872 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.000446535500427716, |
|
"loss": 2.7668, |
|
"theoretical_loss": 3.3185331766894683, |
|
"tokens_seen": 3026321408 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044642857142857147, |
|
"loss": 2.6323, |
|
"theoretical_loss": 3.318527661183209, |
|
"tokens_seen": 3026386944 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"objective/train/docs_used": 1660021, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 3.0193262100219727, |
|
"objective/train/theoretical_loss": 3.318522145829828, |
|
"objective/train/tokens_used": 56503776, |
|
"theoretical_loss": 3.318522145829828, |
|
"tokens_seen": 3026452480 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044632164242942683, |
|
"loss": 2.8896, |
|
"theoretical_loss": 3.318522145829828, |
|
"tokens_seen": 3026452480 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004462147134302823, |
|
"loss": 2.4917, |
|
"theoretical_loss": 3.318516630629317, |
|
"tokens_seen": 3026518016 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004461077844311378, |
|
"loss": 2.3977, |
|
"theoretical_loss": 3.318511115581669, |
|
"tokens_seen": 3026583552 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044600085543199314, |
|
"loss": 2.5538, |
|
"theoretical_loss": 3.3185056006868763, |
|
"tokens_seen": 3026649088 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004458939264328486, |
|
"loss": 2.41, |
|
"theoretical_loss": 3.3185000859449314, |
|
"tokens_seen": 3026714624 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044578699743370403, |
|
"loss": 2.4, |
|
"theoretical_loss": 3.318494571355827, |
|
"tokens_seen": 3026780160 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044568006843455945, |
|
"loss": 2.7303, |
|
"theoretical_loss": 3.3184890569195544, |
|
"tokens_seen": 3026845696 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044557313943541487, |
|
"loss": 2.4075, |
|
"theoretical_loss": 3.3184835426361077, |
|
"tokens_seen": 3026911232 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044546621043627034, |
|
"loss": 2.4872, |
|
"theoretical_loss": 3.3184780285054782, |
|
"tokens_seen": 3026976768 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044535928143712576, |
|
"loss": 2.4217, |
|
"theoretical_loss": 3.318472514527659, |
|
"tokens_seen": 3027042304 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004452523524379812, |
|
"loss": 2.6315, |
|
"theoretical_loss": 3.318467000702642, |
|
"tokens_seen": 3027107840 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004451454234388366, |
|
"loss": 2.553, |
|
"theoretical_loss": 3.31846148703042, |
|
"tokens_seen": 3027173376 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004450384944396921, |
|
"loss": 2.3969, |
|
"theoretical_loss": 3.3184559735109853, |
|
"tokens_seen": 3027238912 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004449315654405475, |
|
"loss": 2.4959, |
|
"theoretical_loss": 3.3184504601443305, |
|
"tokens_seen": 3027304448 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004448246364414029, |
|
"loss": 2.7329, |
|
"theoretical_loss": 3.3184449469304482, |
|
"tokens_seen": 3027369984 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004447177074422584, |
|
"loss": 2.6387, |
|
"theoretical_loss": 3.318439433869331, |
|
"tokens_seen": 3027435520 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044461077844311375, |
|
"loss": 2.5593, |
|
"theoretical_loss": 3.3184339209609703, |
|
"tokens_seen": 3027501056 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004445038494439692, |
|
"loss": 2.5025, |
|
"theoretical_loss": 3.3184284082053597, |
|
"tokens_seen": 3027566592 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044439692044482464, |
|
"loss": 2.8454, |
|
"theoretical_loss": 3.318422895602491, |
|
"tokens_seen": 3027632128 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044428999144568006, |
|
"loss": 2.5777, |
|
"theoretical_loss": 3.318417383152357, |
|
"tokens_seen": 3027697664 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004441830624465355, |
|
"loss": 2.5114, |
|
"theoretical_loss": 3.31841187085495, |
|
"tokens_seen": 3027763200 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044407613344739095, |
|
"loss": 2.4808, |
|
"theoretical_loss": 3.3184063587102632, |
|
"tokens_seen": 3027828736 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004439692044482464, |
|
"loss": 2.6478, |
|
"theoretical_loss": 3.318400846718288, |
|
"tokens_seen": 3027894272 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004438622754491018, |
|
"loss": 2.3825, |
|
"theoretical_loss": 3.318395334879017, |
|
"tokens_seen": 3027959808 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044375534644995726, |
|
"loss": 2.4853, |
|
"theoretical_loss": 3.318389823192443, |
|
"tokens_seen": 3028025344 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"objective/train/docs_used": 1661280, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.6410443782806396, |
|
"objective/train/theoretical_loss": 3.3183843116585585, |
|
"objective/train/tokens_used": 58142176, |
|
"theoretical_loss": 3.3183843116585585, |
|
"tokens_seen": 3028090880 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004436484174508127, |
|
"loss": 2.469, |
|
"theoretical_loss": 3.3183843116585585, |
|
"tokens_seen": 3028090880 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004435414884516681, |
|
"loss": 2.7035, |
|
"theoretical_loss": 3.318378800277356, |
|
"tokens_seen": 3028156416 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004434345594525235, |
|
"loss": 2.5603, |
|
"theoretical_loss": 3.318373289048828, |
|
"tokens_seen": 3028221952 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.000443327630453379, |
|
"loss": 2.5599, |
|
"theoretical_loss": 3.3183677779729663, |
|
"tokens_seen": 3028287488 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044322070145423435, |
|
"loss": 2.5114, |
|
"theoretical_loss": 3.318362267049764, |
|
"tokens_seen": 3028353024 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004431137724550898, |
|
"loss": 2.6164, |
|
"theoretical_loss": 3.3183567562792136, |
|
"tokens_seen": 3028418560 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004430068434559453, |
|
"loss": 2.6386, |
|
"theoretical_loss": 3.318351245661307, |
|
"tokens_seen": 3028484096 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044289991445680066, |
|
"loss": 2.6347, |
|
"theoretical_loss": 3.3183457351960377, |
|
"tokens_seen": 3028549632 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044279298545765614, |
|
"loss": 2.6369, |
|
"theoretical_loss": 3.3183402248833973, |
|
"tokens_seen": 3028615168 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044268605645851155, |
|
"loss": 2.6041, |
|
"theoretical_loss": 3.3183347147233784, |
|
"tokens_seen": 3028680704 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044257912745936703, |
|
"loss": 2.4263, |
|
"theoretical_loss": 3.318329204715974, |
|
"tokens_seen": 3028746240 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004424721984602224, |
|
"loss": 2.5039, |
|
"theoretical_loss": 3.3183236948611756, |
|
"tokens_seen": 3028811776 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.00044236526946107786, |
|
"loss": 2.6112, |
|
"theoretical_loss": 3.3183181851589763, |
|
"tokens_seen": 3028877312 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004422583404619333, |
|
"loss": 2.512, |
|
"theoretical_loss": 3.3183126756093686, |
|
"tokens_seen": 3028942848 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004421514114627887, |
|
"loss": 2.4492, |
|
"theoretical_loss": 3.318307166212345, |
|
"tokens_seen": 3029008384 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"learning_rate": 0.0004420444824636442, |
|
"loss": 2.7508, |
|
"theoretical_loss": 3.318301656967898, |
|
"tokens_seen": 3029073920 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004419375534644996, |
|
"loss": 2.6316, |
|
"theoretical_loss": 3.318296147876019, |
|
"tokens_seen": 3029139456 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.000441830624465355, |
|
"loss": 2.6729, |
|
"theoretical_loss": 3.3182906389367024, |
|
"tokens_seen": 3029204992 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00044172369546621043, |
|
"loss": 2.7944, |
|
"theoretical_loss": 3.318285130149939, |
|
"tokens_seen": 3029270528 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004416167664670659, |
|
"loss": 2.557, |
|
"theoretical_loss": 3.3182796215157224, |
|
"tokens_seen": 3029336064 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00044150983746792127, |
|
"loss": 2.5116, |
|
"theoretical_loss": 3.3182741130340445, |
|
"tokens_seen": 3029401600 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00044140290846877674, |
|
"loss": 2.3178, |
|
"theoretical_loss": 3.318268604704898, |
|
"tokens_seen": 3029467136 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00044129597946963216, |
|
"loss": 2.4742, |
|
"theoretical_loss": 3.318263096528275, |
|
"tokens_seen": 3029532672 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00044118905047048763, |
|
"loss": 2.4291, |
|
"theoretical_loss": 3.318257588504168, |
|
"tokens_seen": 3029598208 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.000441082121471343, |
|
"loss": 2.3404, |
|
"theoretical_loss": 3.31825208063257, |
|
"tokens_seen": 3029663744 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"objective/train/docs_used": 1661975, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.466115951538086, |
|
"objective/train/theoretical_loss": 3.318246572913474, |
|
"objective/train/tokens_used": 59780576, |
|
"theoretical_loss": 3.318246572913474, |
|
"tokens_seen": 3029729280 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00044097519247219847, |
|
"loss": 2.4917, |
|
"theoretical_loss": 3.318246572913474, |
|
"tokens_seen": 3029729280 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00044086826347305394, |
|
"loss": 2.4753, |
|
"theoretical_loss": 3.3182410653468706, |
|
"tokens_seen": 3029794816 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004407613344739093, |
|
"loss": 2.6818, |
|
"theoretical_loss": 3.318235557932754, |
|
"tokens_seen": 3029860352 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004406544054747648, |
|
"loss": 2.5633, |
|
"theoretical_loss": 3.3182300506711155, |
|
"tokens_seen": 3029925888 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004405474764756202, |
|
"loss": 2.3311, |
|
"theoretical_loss": 3.318224543561948, |
|
"tokens_seen": 3029991424 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004404405474764756, |
|
"loss": 2.323, |
|
"theoretical_loss": 3.318219036605245, |
|
"tokens_seen": 3030056960 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00044033361847733104, |
|
"loss": 2.4226, |
|
"theoretical_loss": 3.3182135298009974, |
|
"tokens_seen": 3030122496 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004402266894781865, |
|
"loss": 2.2801, |
|
"theoretical_loss": 3.3182080231491984, |
|
"tokens_seen": 3030188032 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00044011976047904193, |
|
"loss": 2.8268, |
|
"theoretical_loss": 3.3182025166498406, |
|
"tokens_seen": 3030253568 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00044001283147989735, |
|
"loss": 2.6459, |
|
"theoretical_loss": 3.3181970103029164, |
|
"tokens_seen": 3030319104 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004399059024807528, |
|
"loss": 2.4968, |
|
"theoretical_loss": 3.3181915041084182, |
|
"tokens_seen": 3030384640 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043979897348160824, |
|
"loss": 2.5702, |
|
"theoretical_loss": 3.3181859980663386, |
|
"tokens_seen": 3030450176 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043969204448246366, |
|
"loss": 2.4552, |
|
"theoretical_loss": 3.3181804921766695, |
|
"tokens_seen": 3030515712 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004395851154833191, |
|
"loss": 2.3615, |
|
"theoretical_loss": 3.3181749864394043, |
|
"tokens_seen": 3030581248 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043947818648417455, |
|
"loss": 2.5737, |
|
"theoretical_loss": 3.318169480854535, |
|
"tokens_seen": 3030646784 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004393712574850299, |
|
"loss": 2.6029, |
|
"theoretical_loss": 3.318163975422054, |
|
"tokens_seen": 3030712320 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004392643284858854, |
|
"loss": 2.4033, |
|
"theoretical_loss": 3.318158470141954, |
|
"tokens_seen": 3030777856 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004391573994867408, |
|
"loss": 2.5225, |
|
"theoretical_loss": 3.318152965014227, |
|
"tokens_seen": 3030843392 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004390504704875962, |
|
"loss": 2.4062, |
|
"theoretical_loss": 3.3181474600388667, |
|
"tokens_seen": 3030908928 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004389435414884517, |
|
"loss": 2.5334, |
|
"theoretical_loss": 3.3181419552158644, |
|
"tokens_seen": 3030974464 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004388366124893071, |
|
"loss": 2.2213, |
|
"theoretical_loss": 3.318136450545213, |
|
"tokens_seen": 3031040000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004387296834901626, |
|
"loss": 2.5278, |
|
"theoretical_loss": 3.318130946026905, |
|
"tokens_seen": 3031105536 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043862275449101795, |
|
"loss": 2.5084, |
|
"theoretical_loss": 3.318125441660933, |
|
"tokens_seen": 3031171072 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004385158254918734, |
|
"loss": 2.6303, |
|
"theoretical_loss": 3.318119937447289, |
|
"tokens_seen": 3031236608 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043840889649272884, |
|
"loss": 2.5961, |
|
"theoretical_loss": 3.318114433385966, |
|
"tokens_seen": 3031302144 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"objective/train/docs_used": 1662642, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.3627991676330566, |
|
"objective/train/theoretical_loss": 3.3181089294769563, |
|
"objective/train/tokens_used": 61418976, |
|
"theoretical_loss": 3.3181089294769563, |
|
"tokens_seen": 3031367680 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043830196749358426, |
|
"loss": 2.6984, |
|
"theoretical_loss": 3.3181089294769563, |
|
"tokens_seen": 3031367680 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004381950384944397, |
|
"loss": 2.7248, |
|
"theoretical_loss": 3.3181034257202526, |
|
"tokens_seen": 3031433216 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043808810949529515, |
|
"loss": 2.6676, |
|
"theoretical_loss": 3.318097922115847, |
|
"tokens_seen": 3031498752 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043798118049615057, |
|
"loss": 2.4558, |
|
"theoretical_loss": 3.318092418663732, |
|
"tokens_seen": 3031564288 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.000437874251497006, |
|
"loss": 2.3336, |
|
"theoretical_loss": 3.318086915363901, |
|
"tokens_seen": 3031629824 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043776732249786146, |
|
"loss": 2.5785, |
|
"theoretical_loss": 3.3180814122163453, |
|
"tokens_seen": 3031695360 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004376603934987168, |
|
"loss": 2.3364, |
|
"theoretical_loss": 3.318075909221058, |
|
"tokens_seen": 3031760896 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004375534644995723, |
|
"loss": 2.587, |
|
"theoretical_loss": 3.3180704063780313, |
|
"tokens_seen": 3031826432 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004374465355004277, |
|
"loss": 2.4558, |
|
"theoretical_loss": 3.3180649036872585, |
|
"tokens_seen": 3031891968 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.0004373396065012832, |
|
"loss": 2.3549, |
|
"theoretical_loss": 3.318059401148731, |
|
"tokens_seen": 3031957504 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043723267750213856, |
|
"loss": 2.5263, |
|
"theoretical_loss": 3.3180538987624417, |
|
"tokens_seen": 3032023040 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043712574850299403, |
|
"loss": 2.5736, |
|
"theoretical_loss": 3.3180483965283836, |
|
"tokens_seen": 3032088576 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 0.00043701881950384945, |
|
"loss": 2.7003, |
|
"theoretical_loss": 3.3180428944465485, |
|
"tokens_seen": 3032154112 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043691189050470487, |
|
"loss": 2.654, |
|
"theoretical_loss": 3.318037392516929, |
|
"tokens_seen": 3032219648 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043680496150556034, |
|
"loss": 2.6114, |
|
"theoretical_loss": 3.318031890739518, |
|
"tokens_seen": 3032285184 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043669803250641576, |
|
"loss": 2.5595, |
|
"theoretical_loss": 3.318026389114308, |
|
"tokens_seen": 3032350720 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004365911035072712, |
|
"loss": 2.7492, |
|
"theoretical_loss": 3.318020887641291, |
|
"tokens_seen": 3032416256 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004364841745081266, |
|
"loss": 2.5468, |
|
"theoretical_loss": 3.3180153863204596, |
|
"tokens_seen": 3032481792 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043637724550898207, |
|
"loss": 2.7061, |
|
"theoretical_loss": 3.318009885151807, |
|
"tokens_seen": 3032547328 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043627031650983743, |
|
"loss": 2.5121, |
|
"theoretical_loss": 3.3180043841353246, |
|
"tokens_seen": 3032612864 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004361633875106929, |
|
"loss": 2.5887, |
|
"theoretical_loss": 3.317998883271006, |
|
"tokens_seen": 3032678400 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004360564585115483, |
|
"loss": 2.5448, |
|
"theoretical_loss": 3.317993382558843, |
|
"tokens_seen": 3032743936 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004359495295124038, |
|
"loss": 2.506, |
|
"theoretical_loss": 3.317987881998828, |
|
"tokens_seen": 3032809472 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004358426005132592, |
|
"loss": 2.5888, |
|
"theoretical_loss": 3.317982381590954, |
|
"tokens_seen": 3032875008 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043573567151411463, |
|
"loss": 2.6217, |
|
"theoretical_loss": 3.3179768813352135, |
|
"tokens_seen": 3032940544 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"objective/train/docs_used": 1663221, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.339816093444824, |
|
"objective/train/theoretical_loss": 3.3179713812315983, |
|
"objective/train/tokens_used": 63057376, |
|
"theoretical_loss": 3.3179713812315983, |
|
"tokens_seen": 3033006080 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004356287425149701, |
|
"loss": 2.6192, |
|
"theoretical_loss": 3.3179713812315983, |
|
"tokens_seen": 3033006080 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043552181351582547, |
|
"loss": 2.4596, |
|
"theoretical_loss": 3.317965881280102, |
|
"tokens_seen": 3033071616 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043541488451668094, |
|
"loss": 2.3786, |
|
"theoretical_loss": 3.317960381480716, |
|
"tokens_seen": 3033137152 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043530795551753636, |
|
"loss": 2.3844, |
|
"theoretical_loss": 3.3179548818334337, |
|
"tokens_seen": 3033202688 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004352010265183918, |
|
"loss": 2.6781, |
|
"theoretical_loss": 3.317949382338247, |
|
"tokens_seen": 3033268224 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004350940975192472, |
|
"loss": 2.6249, |
|
"theoretical_loss": 3.3179438829951486, |
|
"tokens_seen": 3033333760 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043498716852010267, |
|
"loss": 2.7442, |
|
"theoretical_loss": 3.3179383838041314, |
|
"tokens_seen": 3033399296 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004348802395209581, |
|
"loss": 2.5186, |
|
"theoretical_loss": 3.317932884765187, |
|
"tokens_seen": 3033464832 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004347733105218135, |
|
"loss": 2.7132, |
|
"theoretical_loss": 3.317927385878309, |
|
"tokens_seen": 3033530368 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.000434666381522669, |
|
"loss": 2.6596, |
|
"theoretical_loss": 3.317921887143489, |
|
"tokens_seen": 3033595904 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004345594525235244, |
|
"loss": 2.5732, |
|
"theoretical_loss": 3.31791638856072, |
|
"tokens_seen": 3033661440 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004344525235243798, |
|
"loss": 2.6813, |
|
"theoretical_loss": 3.3179108901299945, |
|
"tokens_seen": 3033726976 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043434559452523524, |
|
"loss": 2.6586, |
|
"theoretical_loss": 3.317905391851305, |
|
"tokens_seen": 3033792512 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004342386655260907, |
|
"loss": 2.4702, |
|
"theoretical_loss": 3.317899893724644, |
|
"tokens_seen": 3033858048 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004341317365269461, |
|
"loss": 2.6286, |
|
"theoretical_loss": 3.3178943957500033, |
|
"tokens_seen": 3033923584 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043402480752780155, |
|
"loss": 2.6536, |
|
"theoretical_loss": 3.3178888979273764, |
|
"tokens_seen": 3033989120 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.000433917878528657, |
|
"loss": 2.4538, |
|
"theoretical_loss": 3.317883400256756, |
|
"tokens_seen": 3034054656 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004338109495295124, |
|
"loss": 2.4945, |
|
"theoretical_loss": 3.3178779027381333, |
|
"tokens_seen": 3034120192 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043370402053036786, |
|
"loss": 2.5167, |
|
"theoretical_loss": 3.3178724053715016, |
|
"tokens_seen": 3034185728 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004335970915312233, |
|
"loss": 2.7934, |
|
"theoretical_loss": 3.3178669081568537, |
|
"tokens_seen": 3034251264 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043349016253207875, |
|
"loss": 2.3483, |
|
"theoretical_loss": 3.3178614110941815, |
|
"tokens_seen": 3034316800 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004333832335329341, |
|
"loss": 2.384, |
|
"theoretical_loss": 3.317855914183478, |
|
"tokens_seen": 3034382336 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004332763045337896, |
|
"loss": 2.6092, |
|
"theoretical_loss": 3.3178504174247356, |
|
"tokens_seen": 3034447872 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.000433169375534645, |
|
"loss": 2.8584, |
|
"theoretical_loss": 3.317844920817947, |
|
"tokens_seen": 3034513408 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004330624465355004, |
|
"loss": 2.6498, |
|
"theoretical_loss": 3.317839424363104, |
|
"tokens_seen": 3034578944 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"objective/train/docs_used": 1664335, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.460336923599243, |
|
"objective/train/theoretical_loss": 3.3178339280602, |
|
"objective/train/tokens_used": 64695776, |
|
"theoretical_loss": 3.3178339280602, |
|
"tokens_seen": 3034644480 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043295551753635584, |
|
"loss": 2.4474, |
|
"theoretical_loss": 3.3178339280602, |
|
"tokens_seen": 3034644480 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004328485885372113, |
|
"loss": 2.5976, |
|
"theoretical_loss": 3.317828431909227, |
|
"tokens_seen": 3034710016 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043274165953806673, |
|
"loss": 2.2452, |
|
"theoretical_loss": 3.3178229359101774, |
|
"tokens_seen": 3034775552 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043263473053892215, |
|
"loss": 2.5553, |
|
"theoretical_loss": 3.3178174400630445, |
|
"tokens_seen": 3034841088 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004325278015397776, |
|
"loss": 2.4949, |
|
"theoretical_loss": 3.31781194436782, |
|
"tokens_seen": 3034906624 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.000432420872540633, |
|
"loss": 2.7255, |
|
"theoretical_loss": 3.3178064488244967, |
|
"tokens_seen": 3034972160 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043231394354148846, |
|
"loss": 2.4908, |
|
"theoretical_loss": 3.317800953433067, |
|
"tokens_seen": 3035037696 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004322070145423439, |
|
"loss": 2.7668, |
|
"theoretical_loss": 3.3177954581935234, |
|
"tokens_seen": 3035103232 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.00043210008554319935, |
|
"loss": 2.6495, |
|
"theoretical_loss": 3.317789963105859, |
|
"tokens_seen": 3035168768 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 0.0004319931565440547, |
|
"loss": 2.6374, |
|
"theoretical_loss": 3.317784468170066, |
|
"tokens_seen": 3035234304 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004318862275449102, |
|
"loss": 2.5718, |
|
"theoretical_loss": 3.3177789733861363, |
|
"tokens_seen": 3035299840 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00043177929854576567, |
|
"loss": 2.6787, |
|
"theoretical_loss": 3.317773478754063, |
|
"tokens_seen": 3035365376 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00043167236954662103, |
|
"loss": 2.6101, |
|
"theoretical_loss": 3.3177679842738383, |
|
"tokens_seen": 3035430912 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004315654405474765, |
|
"loss": 2.7885, |
|
"theoretical_loss": 3.317762489945456, |
|
"tokens_seen": 3035496448 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004314585115483319, |
|
"loss": 2.4024, |
|
"theoretical_loss": 3.3177569957689066, |
|
"tokens_seen": 3035561984 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00043135158254918734, |
|
"loss": 2.5602, |
|
"theoretical_loss": 3.3177515017441843, |
|
"tokens_seen": 3035627520 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00043124465355004276, |
|
"loss": 2.7748, |
|
"theoretical_loss": 3.3177460078712806, |
|
"tokens_seen": 3035693056 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00043113772455089823, |
|
"loss": 2.5992, |
|
"theoretical_loss": 3.3177405141501883, |
|
"tokens_seen": 3035758592 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004310307955517536, |
|
"loss": 2.6069, |
|
"theoretical_loss": 3.3177350205809004, |
|
"tokens_seen": 3035824128 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00043092386655260907, |
|
"loss": 2.5898, |
|
"theoretical_loss": 3.317729527163409, |
|
"tokens_seen": 3035889664 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00043081693755346454, |
|
"loss": 2.7153, |
|
"theoretical_loss": 3.317724033897706, |
|
"tokens_seen": 3035955200 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00043071000855431996, |
|
"loss": 2.6276, |
|
"theoretical_loss": 3.3177185407837855, |
|
"tokens_seen": 3036020736 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004306030795551754, |
|
"loss": 2.5747, |
|
"theoretical_loss": 3.3177130478216386, |
|
"tokens_seen": 3036086272 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004304961505560308, |
|
"loss": 2.921, |
|
"theoretical_loss": 3.3177075550112587, |
|
"tokens_seen": 3036151808 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00043038922155688627, |
|
"loss": 2.4716, |
|
"theoretical_loss": 3.3177020623526374, |
|
"tokens_seen": 3036217344 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"objective/train/docs_used": 1665364, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.702648878097534, |
|
"objective/train/theoretical_loss": 3.3176965698457686, |
|
"objective/train/tokens_used": 66334176, |
|
"theoretical_loss": 3.3176965698457686, |
|
"tokens_seen": 3036282880 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00043028229255774163, |
|
"loss": 2.8016, |
|
"theoretical_loss": 3.3176965698457686, |
|
"tokens_seen": 3036282880 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004301753635585971, |
|
"loss": 2.6286, |
|
"theoretical_loss": 3.3176910774906436, |
|
"tokens_seen": 3036348416 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004300684345594525, |
|
"loss": 2.5836, |
|
"theoretical_loss": 3.3176855852872555, |
|
"tokens_seen": 3036413952 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042996150556030794, |
|
"loss": 2.7408, |
|
"theoretical_loss": 3.3176800932355968, |
|
"tokens_seen": 3036479488 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004298545765611634, |
|
"loss": 2.5826, |
|
"theoretical_loss": 3.3176746013356597, |
|
"tokens_seen": 3036545024 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042974764756201884, |
|
"loss": 2.258, |
|
"theoretical_loss": 3.3176691095874373, |
|
"tokens_seen": 3036610560 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042964071856287425, |
|
"loss": 2.5244, |
|
"theoretical_loss": 3.317663617990922, |
|
"tokens_seen": 3036676096 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004295337895637297, |
|
"loss": 2.4518, |
|
"theoretical_loss": 3.317658126546106, |
|
"tokens_seen": 3036741632 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042942686056458515, |
|
"loss": 2.6188, |
|
"theoretical_loss": 3.3176526352529816, |
|
"tokens_seen": 3036807168 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042931993156544056, |
|
"loss": 2.5812, |
|
"theoretical_loss": 3.3176471441115423, |
|
"tokens_seen": 3036872704 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.000429213002566296, |
|
"loss": 2.5719, |
|
"theoretical_loss": 3.31764165312178, |
|
"tokens_seen": 3036938240 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004291060735671514, |
|
"loss": 2.6047, |
|
"theoretical_loss": 3.317636162283687, |
|
"tokens_seen": 3037003776 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004289991445680069, |
|
"loss": 2.4401, |
|
"theoretical_loss": 3.3176306715972563, |
|
"tokens_seen": 3037069312 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042889221556886224, |
|
"loss": 2.5756, |
|
"theoretical_loss": 3.31762518106248, |
|
"tokens_seen": 3037134848 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004287852865697177, |
|
"loss": 2.5878, |
|
"theoretical_loss": 3.3176196906793516, |
|
"tokens_seen": 3037200384 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004286783575705732, |
|
"loss": 2.5203, |
|
"theoretical_loss": 3.317614200447862, |
|
"tokens_seen": 3037265920 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042857142857142855, |
|
"loss": 2.4597, |
|
"theoretical_loss": 3.3176087103680056, |
|
"tokens_seen": 3037331456 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.000428464499572284, |
|
"loss": 2.4774, |
|
"theoretical_loss": 3.3176032204397736, |
|
"tokens_seen": 3037396992 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042835757057313944, |
|
"loss": 2.6599, |
|
"theoretical_loss": 3.3175977306631594, |
|
"tokens_seen": 3037462528 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004282506415739949, |
|
"loss": 2.6216, |
|
"theoretical_loss": 3.3175922410381546, |
|
"tokens_seen": 3037528064 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004281437125748503, |
|
"loss": 2.6732, |
|
"theoretical_loss": 3.3175867515647526, |
|
"tokens_seen": 3037593600 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042803678357570575, |
|
"loss": 2.8955, |
|
"theoretical_loss": 3.3175812622429453, |
|
"tokens_seen": 3037659136 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042792985457656117, |
|
"loss": 2.6119, |
|
"theoretical_loss": 3.317575773072726, |
|
"tokens_seen": 3037724672 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004278229255774166, |
|
"loss": 2.4313, |
|
"theoretical_loss": 3.317570284054087, |
|
"tokens_seen": 3037790208 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042771599657827206, |
|
"loss": 2.4777, |
|
"theoretical_loss": 3.3175647951870197, |
|
"tokens_seen": 3037855744 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"objective/train/docs_used": 1665791, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.440596342086792, |
|
"objective/train/theoretical_loss": 3.317559306471518, |
|
"objective/train/tokens_used": 67972576, |
|
"theoretical_loss": 3.317559306471518, |
|
"tokens_seen": 3037921280 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004276090675791275, |
|
"loss": 2.4501, |
|
"theoretical_loss": 3.317559306471518, |
|
"tokens_seen": 3037921280 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004275021385799829, |
|
"loss": 2.3919, |
|
"theoretical_loss": 3.3175538179075743, |
|
"tokens_seen": 3037986816 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004273952095808383, |
|
"loss": 2.683, |
|
"theoretical_loss": 3.317548329495181, |
|
"tokens_seen": 3038052352 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004272882805816938, |
|
"loss": 2.6477, |
|
"theoretical_loss": 3.31754284123433, |
|
"tokens_seen": 3038117888 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042718135158254915, |
|
"loss": 2.5088, |
|
"theoretical_loss": 3.3175373531250147, |
|
"tokens_seen": 3038183424 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042707442258340463, |
|
"loss": 2.6919, |
|
"theoretical_loss": 3.3175318651672274, |
|
"tokens_seen": 3038248960 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.00042696749358426005, |
|
"loss": 2.3505, |
|
"theoretical_loss": 3.3175263773609602, |
|
"tokens_seen": 3038314496 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 0.0004268605645851155, |
|
"loss": 2.4019, |
|
"theoretical_loss": 3.3175208897062065, |
|
"tokens_seen": 3038380032 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042675363558597094, |
|
"loss": 2.5938, |
|
"theoretical_loss": 3.3175154022029583, |
|
"tokens_seen": 3038445568 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042664670658682636, |
|
"loss": 2.7483, |
|
"theoretical_loss": 3.317509914851208, |
|
"tokens_seen": 3038511104 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042653977758768183, |
|
"loss": 2.5649, |
|
"theoretical_loss": 3.317504427650948, |
|
"tokens_seen": 3038576640 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004264328485885372, |
|
"loss": 2.7034, |
|
"theoretical_loss": 3.3174989406021718, |
|
"tokens_seen": 3038642176 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042632591958939267, |
|
"loss": 2.5488, |
|
"theoretical_loss": 3.3174934537048713, |
|
"tokens_seen": 3038707712 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004262189905902481, |
|
"loss": 2.5863, |
|
"theoretical_loss": 3.317487966959039, |
|
"tokens_seen": 3038773248 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004261120615911035, |
|
"loss": 2.678, |
|
"theoretical_loss": 3.317482480364667, |
|
"tokens_seen": 3038838784 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004260051325919589, |
|
"loss": 2.4831, |
|
"theoretical_loss": 3.3174769939217494, |
|
"tokens_seen": 3038904320 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004258982035928144, |
|
"loss": 2.5811, |
|
"theoretical_loss": 3.3174715076302776, |
|
"tokens_seen": 3038969856 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042579127459366976, |
|
"loss": 2.5055, |
|
"theoretical_loss": 3.317466021490244, |
|
"tokens_seen": 3039035392 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042568434559452523, |
|
"loss": 2.7085, |
|
"theoretical_loss": 3.3174605355016418, |
|
"tokens_seen": 3039100928 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004255774165953807, |
|
"loss": 2.2972, |
|
"theoretical_loss": 3.3174550496644626, |
|
"tokens_seen": 3039166464 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004254704875962361, |
|
"loss": 2.7515, |
|
"theoretical_loss": 3.3174495639787005, |
|
"tokens_seen": 3039232000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042536355859709154, |
|
"loss": 2.449, |
|
"theoretical_loss": 3.317444078444346, |
|
"tokens_seen": 3039297536 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042525662959794696, |
|
"loss": 2.7037, |
|
"theoretical_loss": 3.317438593061394, |
|
"tokens_seen": 3039363072 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042514970059880243, |
|
"loss": 2.6908, |
|
"theoretical_loss": 3.317433107829835, |
|
"tokens_seen": 3039428608 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004250427715996578, |
|
"loss": 2.461, |
|
"theoretical_loss": 3.3174276227496633, |
|
"tokens_seen": 3039494144 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"objective/train/docs_used": 1666356, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.657942295074463, |
|
"objective/train/theoretical_loss": 3.31742213782087, |
|
"objective/train/tokens_used": 69610976, |
|
"theoretical_loss": 3.31742213782087, |
|
"tokens_seen": 3039559680 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042493584260051327, |
|
"loss": 2.6056, |
|
"theoretical_loss": 3.31742213782087, |
|
"tokens_seen": 3039559680 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004248289136013687, |
|
"loss": 2.5217, |
|
"theoretical_loss": 3.3174166530434483, |
|
"tokens_seen": 3039625216 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004247219846022241, |
|
"loss": 2.6368, |
|
"theoretical_loss": 3.3174111684173906, |
|
"tokens_seen": 3039690752 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004246150556030796, |
|
"loss": 2.4843, |
|
"theoretical_loss": 3.3174056839426895, |
|
"tokens_seen": 3039756288 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.000424508126603935, |
|
"loss": 2.671, |
|
"theoretical_loss": 3.317400199619338, |
|
"tokens_seen": 3039821824 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004244011976047904, |
|
"loss": 2.4733, |
|
"theoretical_loss": 3.3173947154473282, |
|
"tokens_seen": 3039887360 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042429426860564584, |
|
"loss": 2.688, |
|
"theoretical_loss": 3.3173892314266524, |
|
"tokens_seen": 3039952896 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004241873396065013, |
|
"loss": 2.6233, |
|
"theoretical_loss": 3.3173837475573036, |
|
"tokens_seen": 3040018432 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042408041060735673, |
|
"loss": 2.6475, |
|
"theoretical_loss": 3.3173782638392746, |
|
"tokens_seen": 3040083968 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042397348160821215, |
|
"loss": 2.635, |
|
"theoretical_loss": 3.3173727802725574, |
|
"tokens_seen": 3040149504 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042386655260906757, |
|
"loss": 2.5467, |
|
"theoretical_loss": 3.3173672968571446, |
|
"tokens_seen": 3040215040 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042375962360992304, |
|
"loss": 2.5554, |
|
"theoretical_loss": 3.3173618135930294, |
|
"tokens_seen": 3040280576 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042365269461077846, |
|
"loss": 2.5479, |
|
"theoretical_loss": 3.3173563304802034, |
|
"tokens_seen": 3040346112 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004235457656116339, |
|
"loss": 2.4553, |
|
"theoretical_loss": 3.31735084751866, |
|
"tokens_seen": 3040411648 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042343883661248935, |
|
"loss": 2.6328, |
|
"theoretical_loss": 3.3173453647083915, |
|
"tokens_seen": 3040477184 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004233319076133447, |
|
"loss": 2.6439, |
|
"theoretical_loss": 3.31733988204939, |
|
"tokens_seen": 3040542720 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004232249786142002, |
|
"loss": 2.7004, |
|
"theoretical_loss": 3.317334399541649, |
|
"tokens_seen": 3040608256 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004231180496150556, |
|
"loss": 2.3918, |
|
"theoretical_loss": 3.31732891718516, |
|
"tokens_seen": 3040673792 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004230111206159111, |
|
"loss": 2.7061, |
|
"theoretical_loss": 3.3173234349799166, |
|
"tokens_seen": 3040739328 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042290419161676644, |
|
"loss": 2.5726, |
|
"theoretical_loss": 3.3173179529259107, |
|
"tokens_seen": 3040804864 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004227972626176219, |
|
"loss": 2.3567, |
|
"theoretical_loss": 3.317312471023135, |
|
"tokens_seen": 3040870400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004226903336184774, |
|
"loss": 2.4497, |
|
"theoretical_loss": 3.3173069892715823, |
|
"tokens_seen": 3040935936 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042258340461933275, |
|
"loss": 2.5488, |
|
"theoretical_loss": 3.3173015076712447, |
|
"tokens_seen": 3041001472 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004224764756201882, |
|
"loss": 2.5573, |
|
"theoretical_loss": 3.3172960262221154, |
|
"tokens_seen": 3041067008 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042236954662104364, |
|
"loss": 2.5725, |
|
"theoretical_loss": 3.3172905449241865, |
|
"tokens_seen": 3041132544 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"objective/train/docs_used": 1667402, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.76525616645813, |
|
"objective/train/theoretical_loss": 3.317285063777451, |
|
"objective/train/tokens_used": 71249376, |
|
"theoretical_loss": 3.317285063777451, |
|
"tokens_seen": 3041198080 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042226261762189906, |
|
"loss": 2.7626, |
|
"theoretical_loss": 3.317285063777451, |
|
"tokens_seen": 3041198080 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004221556886227545, |
|
"loss": 2.5848, |
|
"theoretical_loss": 3.3172795827819, |
|
"tokens_seen": 3041263616 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.00042204875962360995, |
|
"loss": 2.6828, |
|
"theoretical_loss": 3.3172741019375285, |
|
"tokens_seen": 3041329152 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004219418306244653, |
|
"loss": 2.4074, |
|
"theoretical_loss": 3.3172686212443274, |
|
"tokens_seen": 3041394688 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"learning_rate": 0.0004218349016253208, |
|
"loss": 2.8892, |
|
"theoretical_loss": 3.3172631407022894, |
|
"tokens_seen": 3041460224 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00042172797262617626, |
|
"loss": 2.7005, |
|
"theoretical_loss": 3.317257660311408, |
|
"tokens_seen": 3041525760 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004216210436270317, |
|
"loss": 2.559, |
|
"theoretical_loss": 3.3172521800716748, |
|
"tokens_seen": 3041591296 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004215141146278871, |
|
"loss": 2.6013, |
|
"theoretical_loss": 3.3172466999830825, |
|
"tokens_seen": 3041656832 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004214071856287425, |
|
"loss": 2.6889, |
|
"theoretical_loss": 3.317241220045624, |
|
"tokens_seen": 3041722368 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.000421300256629598, |
|
"loss": 2.7481, |
|
"theoretical_loss": 3.317235740259292, |
|
"tokens_seen": 3041787904 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00042119332763045336, |
|
"loss": 2.8443, |
|
"theoretical_loss": 3.3172302606240787, |
|
"tokens_seen": 3041853440 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00042108639863130883, |
|
"loss": 2.608, |
|
"theoretical_loss": 3.3172247811399767, |
|
"tokens_seen": 3041918976 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00042097946963216425, |
|
"loss": 2.6413, |
|
"theoretical_loss": 3.3172193018069787, |
|
"tokens_seen": 3041984512 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00042087254063301967, |
|
"loss": 2.7409, |
|
"theoretical_loss": 3.317213822625077, |
|
"tokens_seen": 3042050048 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004207656116338751, |
|
"loss": 2.5192, |
|
"theoretical_loss": 3.317208343594265, |
|
"tokens_seen": 3042115584 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00042065868263473056, |
|
"loss": 2.5256, |
|
"theoretical_loss": 3.3172028647145346, |
|
"tokens_seen": 3042181120 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.000420551753635586, |
|
"loss": 2.9425, |
|
"theoretical_loss": 3.3171973859858785, |
|
"tokens_seen": 3042246656 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004204448246364414, |
|
"loss": 2.6551, |
|
"theoretical_loss": 3.317191907408289, |
|
"tokens_seen": 3042312192 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00042033789563729687, |
|
"loss": 2.5016, |
|
"theoretical_loss": 3.3171864289817594, |
|
"tokens_seen": 3042377728 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004202309666381523, |
|
"loss": 2.4924, |
|
"theoretical_loss": 3.3171809507062817, |
|
"tokens_seen": 3042443264 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004201240376390077, |
|
"loss": 2.4651, |
|
"theoretical_loss": 3.3171754725818485, |
|
"tokens_seen": 3042508800 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004200171086398631, |
|
"loss": 2.6467, |
|
"theoretical_loss": 3.3171699946084523, |
|
"tokens_seen": 3042574336 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004199101796407186, |
|
"loss": 2.7134, |
|
"theoretical_loss": 3.3171645167860864, |
|
"tokens_seen": 3042639872 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00041980325064157396, |
|
"loss": 2.5778, |
|
"theoretical_loss": 3.3171590391147427, |
|
"tokens_seen": 3042705408 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00041969632164242943, |
|
"loss": 2.6097, |
|
"theoretical_loss": 3.317153561594414, |
|
"tokens_seen": 3042770944 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"objective/train/docs_used": 1668521, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.784074306488037, |
|
"objective/train/theoretical_loss": 3.3171480842250927, |
|
"objective/train/tokens_used": 72887776, |
|
"theoretical_loss": 3.3171480842250927, |
|
"tokens_seen": 3042836480 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004195893926432849, |
|
"loss": 2.4201, |
|
"theoretical_loss": 3.3171480842250927, |
|
"tokens_seen": 3042836480 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00041948246364414027, |
|
"loss": 2.4407, |
|
"theoretical_loss": 3.3171426070067715, |
|
"tokens_seen": 3042902016 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00041937553464499574, |
|
"loss": 2.4949, |
|
"theoretical_loss": 3.317137129939443, |
|
"tokens_seen": 3042967552 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00041926860564585116, |
|
"loss": 2.5337, |
|
"theoretical_loss": 3.3171316530231003, |
|
"tokens_seen": 3043033088 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004191616766467066, |
|
"loss": 2.7346, |
|
"theoretical_loss": 3.3171261762577346, |
|
"tokens_seen": 3043098624 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.000419054747647562, |
|
"loss": 2.6234, |
|
"theoretical_loss": 3.31712069964334, |
|
"tokens_seen": 3043164160 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004189478186484175, |
|
"loss": 2.7589, |
|
"theoretical_loss": 3.3171152231799086, |
|
"tokens_seen": 3043229696 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004188408896492729, |
|
"loss": 2.5401, |
|
"theoretical_loss": 3.317109746867432, |
|
"tokens_seen": 3043295232 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004187339606501283, |
|
"loss": 2.59, |
|
"theoretical_loss": 3.317104270705905, |
|
"tokens_seen": 3043360768 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004186270316509838, |
|
"loss": 2.7252, |
|
"theoretical_loss": 3.3170987946953177, |
|
"tokens_seen": 3043426304 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004185201026518392, |
|
"loss": 2.6156, |
|
"theoretical_loss": 3.3170933188356644, |
|
"tokens_seen": 3043491840 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004184131736526946, |
|
"loss": 2.7091, |
|
"theoretical_loss": 3.3170878431269366, |
|
"tokens_seen": 3043557376 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00041830624465355004, |
|
"loss": 2.6037, |
|
"theoretical_loss": 3.3170823675691277, |
|
"tokens_seen": 3043622912 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004181993156544055, |
|
"loss": 2.4912, |
|
"theoretical_loss": 3.31707689216223, |
|
"tokens_seen": 3043688448 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004180923866552609, |
|
"loss": 2.6153, |
|
"theoretical_loss": 3.317071416906236, |
|
"tokens_seen": 3043753984 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00041798545765611635, |
|
"loss": 2.6945, |
|
"theoretical_loss": 3.317065941801139, |
|
"tokens_seen": 3043819520 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00041787852865697177, |
|
"loss": 2.6036, |
|
"theoretical_loss": 3.3170604668469297, |
|
"tokens_seen": 3043885056 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004177715996578272, |
|
"loss": 2.6957, |
|
"theoretical_loss": 3.317054992043603, |
|
"tokens_seen": 3043950592 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00041766467065868266, |
|
"loss": 2.6709, |
|
"theoretical_loss": 3.31704951739115, |
|
"tokens_seen": 3044016128 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004175577416595381, |
|
"loss": 2.7614, |
|
"theoretical_loss": 3.317044042889564, |
|
"tokens_seen": 3044081664 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00041745081266039355, |
|
"loss": 2.4733, |
|
"theoretical_loss": 3.317038568538837, |
|
"tokens_seen": 3044147200 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004173438836612489, |
|
"loss": 2.5863, |
|
"theoretical_loss": 3.317033094338962, |
|
"tokens_seen": 3044212736 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004172369546621044, |
|
"loss": 2.565, |
|
"theoretical_loss": 3.317027620289932, |
|
"tokens_seen": 3044278272 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004171300256629598, |
|
"loss": 2.7164, |
|
"theoretical_loss": 3.3170221463917384, |
|
"tokens_seen": 3044343808 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004170230966638152, |
|
"loss": 2.6085, |
|
"theoretical_loss": 3.317016672644375, |
|
"tokens_seen": 3044409344 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"objective/train/docs_used": 1668971, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.9621496200561523, |
|
"objective/train/theoretical_loss": 3.3170111990478337, |
|
"objective/train/tokens_used": 74526176, |
|
"theoretical_loss": 3.3170111990478337, |
|
"tokens_seen": 3044474880 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.00041691616766467064, |
|
"loss": 2.9867, |
|
"theoretical_loss": 3.3170111990478337, |
|
"tokens_seen": 3044474880 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 0.0004168092386655261, |
|
"loss": 2.6062, |
|
"theoretical_loss": 3.3170057256021077, |
|
"tokens_seen": 3044540416 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004167023096663815, |
|
"loss": 2.5778, |
|
"theoretical_loss": 3.3170002523071886, |
|
"tokens_seen": 3044605952 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041659538066723695, |
|
"loss": 2.5364, |
|
"theoretical_loss": 3.3169947791630703, |
|
"tokens_seen": 3044671488 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041648845166809243, |
|
"loss": 2.4894, |
|
"theoretical_loss": 3.316989306169744, |
|
"tokens_seen": 3044737024 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041638152266894785, |
|
"loss": 2.7874, |
|
"theoretical_loss": 3.3169838333272037, |
|
"tokens_seen": 3044802560 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041627459366980326, |
|
"loss": 2.6971, |
|
"theoretical_loss": 3.3169783606354413, |
|
"tokens_seen": 3044868096 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004161676646706587, |
|
"loss": 2.7493, |
|
"theoretical_loss": 3.316972888094449, |
|
"tokens_seen": 3044933632 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041606073567151416, |
|
"loss": 2.6092, |
|
"theoretical_loss": 3.3169674157042195, |
|
"tokens_seen": 3044999168 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004159538066723695, |
|
"loss": 2.4757, |
|
"theoretical_loss": 3.3169619434647464, |
|
"tokens_seen": 3045064704 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.000415846877673225, |
|
"loss": 2.7305, |
|
"theoretical_loss": 3.3169564713760216, |
|
"tokens_seen": 3045130240 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004157399486740804, |
|
"loss": 2.5703, |
|
"theoretical_loss": 3.3169509994380375, |
|
"tokens_seen": 3045195776 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041563301967493583, |
|
"loss": 2.5121, |
|
"theoretical_loss": 3.3169455276507867, |
|
"tokens_seen": 3045261312 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004155260906757913, |
|
"loss": 2.6273, |
|
"theoretical_loss": 3.3169400560142623, |
|
"tokens_seen": 3045326848 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004154191616766467, |
|
"loss": 2.6654, |
|
"theoretical_loss": 3.3169345845284566, |
|
"tokens_seen": 3045392384 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041531223267750214, |
|
"loss": 2.5341, |
|
"theoretical_loss": 3.3169291131933623, |
|
"tokens_seen": 3045457920 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041520530367835756, |
|
"loss": 2.5806, |
|
"theoretical_loss": 3.316923642008972, |
|
"tokens_seen": 3045523456 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041509837467921303, |
|
"loss": 2.598, |
|
"theoretical_loss": 3.316918170975278, |
|
"tokens_seen": 3045588992 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041499144568006845, |
|
"loss": 2.6477, |
|
"theoretical_loss": 3.3169127000922733, |
|
"tokens_seen": 3045654528 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041488451668092387, |
|
"loss": 2.6531, |
|
"theoretical_loss": 3.31690722935995, |
|
"tokens_seen": 3045720064 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004147775876817793, |
|
"loss": 2.4507, |
|
"theoretical_loss": 3.316901758778302, |
|
"tokens_seen": 3045785600 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041467065868263476, |
|
"loss": 2.5672, |
|
"theoretical_loss": 3.3168962883473205, |
|
"tokens_seen": 3045851136 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004145637296834902, |
|
"loss": 2.5622, |
|
"theoretical_loss": 3.3168908180669985, |
|
"tokens_seen": 3045916672 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004144568006843456, |
|
"loss": 2.7632, |
|
"theoretical_loss": 3.316885347937329, |
|
"tokens_seen": 3045982208 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041434987168520107, |
|
"loss": 2.7035, |
|
"theoretical_loss": 3.3168798779583035, |
|
"tokens_seen": 3046047744 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"objective/train/docs_used": 1670023, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.6770851612091064, |
|
"objective/train/theoretical_loss": 3.316874408129916, |
|
"objective/train/tokens_used": 76164576, |
|
"theoretical_loss": 3.316874408129916, |
|
"tokens_seen": 3046113280 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041424294268605644, |
|
"loss": 2.6328, |
|
"theoretical_loss": 3.316874408129916, |
|
"tokens_seen": 3046113280 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004141360136869119, |
|
"loss": 2.5711, |
|
"theoretical_loss": 3.3168689384521586, |
|
"tokens_seen": 3046178816 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004140290846877673, |
|
"loss": 2.5709, |
|
"theoretical_loss": 3.316863468925024, |
|
"tokens_seen": 3046244352 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041392215568862275, |
|
"loss": 2.5086, |
|
"theoretical_loss": 3.316857999548504, |
|
"tokens_seen": 3046309888 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041381522668947816, |
|
"loss": 2.7012, |
|
"theoretical_loss": 3.3168525303225924, |
|
"tokens_seen": 3046375424 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041370829769033364, |
|
"loss": 2.7311, |
|
"theoretical_loss": 3.316847061247281, |
|
"tokens_seen": 3046440960 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004136013686911891, |
|
"loss": 2.4887, |
|
"theoretical_loss": 3.316841592322563, |
|
"tokens_seen": 3046506496 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004134944396920445, |
|
"loss": 2.6398, |
|
"theoretical_loss": 3.3168361235484305, |
|
"tokens_seen": 3046572032 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041338751069289995, |
|
"loss": 2.8039, |
|
"theoretical_loss": 3.3168306549248765, |
|
"tokens_seen": 3046637568 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041328058169375537, |
|
"loss": 2.6058, |
|
"theoretical_loss": 3.3168251864518936, |
|
"tokens_seen": 3046703104 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004131736526946108, |
|
"loss": 2.6846, |
|
"theoretical_loss": 3.316819718129474, |
|
"tokens_seen": 3046768640 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004130667236954662, |
|
"loss": 2.6162, |
|
"theoretical_loss": 3.31681424995761, |
|
"tokens_seen": 3046834176 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004129597946963217, |
|
"loss": 2.5892, |
|
"theoretical_loss": 3.3168087819362957, |
|
"tokens_seen": 3046899712 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041285286569717704, |
|
"loss": 2.4469, |
|
"theoretical_loss": 3.316803314065522, |
|
"tokens_seen": 3046965248 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004127459366980325, |
|
"loss": 2.6708, |
|
"theoretical_loss": 3.316797846345283, |
|
"tokens_seen": 3047030784 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041263900769888793, |
|
"loss": 2.6839, |
|
"theoretical_loss": 3.3167923787755704, |
|
"tokens_seen": 3047096320 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041253207869974335, |
|
"loss": 2.7111, |
|
"theoretical_loss": 3.316786911356377, |
|
"tokens_seen": 3047161856 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004124251497005988, |
|
"loss": 2.5143, |
|
"theoretical_loss": 3.3167814440876957, |
|
"tokens_seen": 3047227392 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041231822070145424, |
|
"loss": 2.4868, |
|
"theoretical_loss": 3.316775976969519, |
|
"tokens_seen": 3047292928 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004122112917023097, |
|
"loss": 2.394, |
|
"theoretical_loss": 3.316770510001839, |
|
"tokens_seen": 3047358464 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004121043627031651, |
|
"loss": 2.6288, |
|
"theoretical_loss": 3.316765043184649, |
|
"tokens_seen": 3047424000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041199743370402055, |
|
"loss": 2.6458, |
|
"theoretical_loss": 3.316759576517941, |
|
"tokens_seen": 3047489536 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.00041189050470487597, |
|
"loss": 2.4446, |
|
"theoretical_loss": 3.316754110001708, |
|
"tokens_seen": 3047555072 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 0.0004117835757057314, |
|
"loss": 2.6992, |
|
"theoretical_loss": 3.3167486436359432, |
|
"tokens_seen": 3047620608 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004116766467065868, |
|
"loss": 2.7639, |
|
"theoretical_loss": 3.3167431774206384, |
|
"tokens_seen": 3047686144 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"objective/train/docs_used": 1670610, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.5403025150299072, |
|
"objective/train/theoretical_loss": 3.316737711355786, |
|
"objective/train/tokens_used": 77802976, |
|
"theoretical_loss": 3.316737711355786, |
|
"tokens_seen": 3047751680 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004115697177074423, |
|
"loss": 2.6853, |
|
"theoretical_loss": 3.316737711355786, |
|
"tokens_seen": 3047751680 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004114627887082977, |
|
"loss": 2.7335, |
|
"theoretical_loss": 3.3167322454413792, |
|
"tokens_seen": 3047817216 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004113558597091531, |
|
"loss": 2.6713, |
|
"theoretical_loss": 3.3167267796774107, |
|
"tokens_seen": 3047882752 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004112489307100086, |
|
"loss": 2.8081, |
|
"theoretical_loss": 3.316721314063873, |
|
"tokens_seen": 3047948288 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.000411142001710864, |
|
"loss": 2.5437, |
|
"theoretical_loss": 3.316715848600759, |
|
"tokens_seen": 3048013824 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00041103507271171943, |
|
"loss": 2.6631, |
|
"theoretical_loss": 3.3167103832880604, |
|
"tokens_seen": 3048079360 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00041092814371257485, |
|
"loss": 2.4265, |
|
"theoretical_loss": 3.3167049181257706, |
|
"tokens_seen": 3048144896 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004108212147134303, |
|
"loss": 2.4767, |
|
"theoretical_loss": 3.316699453113882, |
|
"tokens_seen": 3048210432 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004107142857142857, |
|
"loss": 2.702, |
|
"theoretical_loss": 3.316693988252387, |
|
"tokens_seen": 3048275968 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00041060735671514116, |
|
"loss": 2.5563, |
|
"theoretical_loss": 3.3166885235412784, |
|
"tokens_seen": 3048341504 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00041050042771599663, |
|
"loss": 2.7919, |
|
"theoretical_loss": 3.3166830589805496, |
|
"tokens_seen": 3048407040 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.000410393498716852, |
|
"loss": 2.5336, |
|
"theoretical_loss": 3.316677594570192, |
|
"tokens_seen": 3048472576 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00041028656971770747, |
|
"loss": 2.6564, |
|
"theoretical_loss": 3.3166721303101987, |
|
"tokens_seen": 3048538112 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004101796407185629, |
|
"loss": 2.577, |
|
"theoretical_loss": 3.316666666200563, |
|
"tokens_seen": 3048603648 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004100727117194183, |
|
"loss": 2.5261, |
|
"theoretical_loss": 3.316661202241276, |
|
"tokens_seen": 3048669184 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004099657827202737, |
|
"loss": 2.6642, |
|
"theoretical_loss": 3.316655738432332, |
|
"tokens_seen": 3048734720 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004098588537211292, |
|
"loss": 2.6357, |
|
"theoretical_loss": 3.316650274773723, |
|
"tokens_seen": 3048800256 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004097519247219846, |
|
"loss": 2.7002, |
|
"theoretical_loss": 3.3166448112654408, |
|
"tokens_seen": 3048865792 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040964499572284003, |
|
"loss": 2.5809, |
|
"theoretical_loss": 3.316639347907479, |
|
"tokens_seen": 3048931328 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004095380667236955, |
|
"loss": 2.7383, |
|
"theoretical_loss": 3.3166338846998302, |
|
"tokens_seen": 3048996864 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004094311377245509, |
|
"loss": 2.7608, |
|
"theoretical_loss": 3.316628421642487, |
|
"tokens_seen": 3049062400 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040932420872540634, |
|
"loss": 2.4564, |
|
"theoretical_loss": 3.316622958735442, |
|
"tokens_seen": 3049127936 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040921727972626176, |
|
"loss": 2.5476, |
|
"theoretical_loss": 3.316617495978687, |
|
"tokens_seen": 3049193472 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040911035072711723, |
|
"loss": 2.5533, |
|
"theoretical_loss": 3.3166120333722158, |
|
"tokens_seen": 3049259008 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004090034217279726, |
|
"loss": 2.75, |
|
"theoretical_loss": 3.3166065709160204, |
|
"tokens_seen": 3049324544 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"objective/train/docs_used": 1671708, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.5090513229370117, |
|
"objective/train/theoretical_loss": 3.3166011086100937, |
|
"objective/train/tokens_used": 79441376, |
|
"theoretical_loss": 3.3166011086100937, |
|
"tokens_seen": 3049390080 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040889649272882807, |
|
"loss": 2.6123, |
|
"theoretical_loss": 3.3166011086100937, |
|
"tokens_seen": 3049390080 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004087895637296835, |
|
"loss": 2.4582, |
|
"theoretical_loss": 3.316595646454428, |
|
"tokens_seen": 3049455616 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004086826347305389, |
|
"loss": 2.4455, |
|
"theoretical_loss": 3.3165901844490167, |
|
"tokens_seen": 3049521152 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040857570573139433, |
|
"loss": 2.6832, |
|
"theoretical_loss": 3.316584722593851, |
|
"tokens_seen": 3049586688 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004084687767322498, |
|
"loss": 2.8326, |
|
"theoretical_loss": 3.3165792608889255, |
|
"tokens_seen": 3049652224 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004083618477331053, |
|
"loss": 2.6351, |
|
"theoretical_loss": 3.3165737993342312, |
|
"tokens_seen": 3049717760 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040825491873396064, |
|
"loss": 2.5201, |
|
"theoretical_loss": 3.3165683379297612, |
|
"tokens_seen": 3049783296 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004081479897348161, |
|
"loss": 2.6653, |
|
"theoretical_loss": 3.3165628766755084, |
|
"tokens_seen": 3049848832 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040804106073567153, |
|
"loss": 2.715, |
|
"theoretical_loss": 3.3165574155714657, |
|
"tokens_seen": 3049914368 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040793413173652695, |
|
"loss": 2.5546, |
|
"theoretical_loss": 3.316551954617625, |
|
"tokens_seen": 3049979904 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040782720273738237, |
|
"loss": 2.7839, |
|
"theoretical_loss": 3.3165464938139797, |
|
"tokens_seen": 3050045440 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040772027373823784, |
|
"loss": 2.4923, |
|
"theoretical_loss": 3.3165410331605214, |
|
"tokens_seen": 3050110976 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004076133447390932, |
|
"loss": 2.6646, |
|
"theoretical_loss": 3.3165355726572434, |
|
"tokens_seen": 3050176512 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004075064157399487, |
|
"loss": 2.5978, |
|
"theoretical_loss": 3.316530112304139, |
|
"tokens_seen": 3050242048 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040739948674080415, |
|
"loss": 2.8643, |
|
"theoretical_loss": 3.3165246521011995, |
|
"tokens_seen": 3050307584 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004072925577416595, |
|
"loss": 2.3827, |
|
"theoretical_loss": 3.3165191920484185, |
|
"tokens_seen": 3050373120 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.000407185628742515, |
|
"loss": 2.6466, |
|
"theoretical_loss": 3.3165137321457885, |
|
"tokens_seen": 3050438656 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004070786997433704, |
|
"loss": 2.5199, |
|
"theoretical_loss": 3.316508272393302, |
|
"tokens_seen": 3050504192 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004069717707442259, |
|
"loss": 2.5219, |
|
"theoretical_loss": 3.3165028127909513, |
|
"tokens_seen": 3050569728 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040686484174508124, |
|
"loss": 2.6259, |
|
"theoretical_loss": 3.3164973533387294, |
|
"tokens_seen": 3050635264 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.0004067579127459367, |
|
"loss": 2.607, |
|
"theoretical_loss": 3.3164918940366293, |
|
"tokens_seen": 3050700800 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 0.00040665098374679213, |
|
"loss": 2.7539, |
|
"theoretical_loss": 3.3164864348846432, |
|
"tokens_seen": 3050766336 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040654405474764755, |
|
"loss": 2.704, |
|
"theoretical_loss": 3.3164809758827634, |
|
"tokens_seen": 3050831872 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.000406437125748503, |
|
"loss": 2.7537, |
|
"theoretical_loss": 3.3164755170309834, |
|
"tokens_seen": 3050897408 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040633019674935844, |
|
"loss": 2.7319, |
|
"theoretical_loss": 3.3164700583292954, |
|
"tokens_seen": 3050962944 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"objective/train/docs_used": 1672113, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.8296430110931396, |
|
"objective/train/theoretical_loss": 3.316464599777692, |
|
"objective/train/tokens_used": 81079776, |
|
"theoretical_loss": 3.316464599777692, |
|
"tokens_seen": 3051028480 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040622326775021386, |
|
"loss": 2.5403, |
|
"theoretical_loss": 3.316464599777692, |
|
"tokens_seen": 3051028480 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004061163387510693, |
|
"loss": 2.7452, |
|
"theoretical_loss": 3.316459141376166, |
|
"tokens_seen": 3051094016 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040600940975192475, |
|
"loss": 2.7405, |
|
"theoretical_loss": 3.31645368312471, |
|
"tokens_seen": 3051159552 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004059024807527802, |
|
"loss": 2.7306, |
|
"theoretical_loss": 3.3164482250233163, |
|
"tokens_seen": 3051225088 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004057955517536356, |
|
"loss": 2.5532, |
|
"theoretical_loss": 3.3164427670719783, |
|
"tokens_seen": 3051290624 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.000405688622754491, |
|
"loss": 2.5735, |
|
"theoretical_loss": 3.316437309270688, |
|
"tokens_seen": 3051356160 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004055816937553465, |
|
"loss": 2.9272, |
|
"theoretical_loss": 3.316431851619438, |
|
"tokens_seen": 3051421696 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040547476475620185, |
|
"loss": 2.5735, |
|
"theoretical_loss": 3.316426394118222, |
|
"tokens_seen": 3051487232 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004053678357570573, |
|
"loss": 2.762, |
|
"theoretical_loss": 3.316420936767031, |
|
"tokens_seen": 3051552768 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004052609067579128, |
|
"loss": 2.5567, |
|
"theoretical_loss": 3.316415479565859, |
|
"tokens_seen": 3051618304 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040515397775876816, |
|
"loss": 2.7632, |
|
"theoretical_loss": 3.3164100225146984, |
|
"tokens_seen": 3051683840 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040504704875962363, |
|
"loss": 2.6272, |
|
"theoretical_loss": 3.3164045656135417, |
|
"tokens_seen": 3051749376 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040494011976047905, |
|
"loss": 2.6097, |
|
"theoretical_loss": 3.316399108862381, |
|
"tokens_seen": 3051814912 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040483319076133447, |
|
"loss": 2.6183, |
|
"theoretical_loss": 3.3163936522612096, |
|
"tokens_seen": 3051880448 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004047262617621899, |
|
"loss": 2.6047, |
|
"theoretical_loss": 3.3163881958100205, |
|
"tokens_seen": 3051945984 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040461933276304536, |
|
"loss": 2.4767, |
|
"theoretical_loss": 3.3163827395088052, |
|
"tokens_seen": 3052011520 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004045124037639008, |
|
"loss": 2.8153, |
|
"theoretical_loss": 3.3163772833575575, |
|
"tokens_seen": 3052077056 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004044054747647562, |
|
"loss": 2.6842, |
|
"theoretical_loss": 3.3163718273562695, |
|
"tokens_seen": 3052142592 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040429854576561167, |
|
"loss": 2.6261, |
|
"theoretical_loss": 3.316366371504934, |
|
"tokens_seen": 3052208128 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004041916167664671, |
|
"loss": 2.4601, |
|
"theoretical_loss": 3.3163609158035436, |
|
"tokens_seen": 3052273664 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004040846877673225, |
|
"loss": 2.4056, |
|
"theoretical_loss": 3.316355460252091, |
|
"tokens_seen": 3052339200 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004039777587681779, |
|
"loss": 2.6035, |
|
"theoretical_loss": 3.3163500048505687, |
|
"tokens_seen": 3052404736 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004038708297690334, |
|
"loss": 2.5094, |
|
"theoretical_loss": 3.3163445495989694, |
|
"tokens_seen": 3052470272 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040376390076988876, |
|
"loss": 2.7304, |
|
"theoretical_loss": 3.3163390944972857, |
|
"tokens_seen": 3052535808 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040365697177074424, |
|
"loss": 2.4984, |
|
"theoretical_loss": 3.316333639545511, |
|
"tokens_seen": 3052601344 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"objective/train/docs_used": 1673312, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.1437244415283203, |
|
"objective/train/theoretical_loss": 3.316328184743637, |
|
"objective/train/tokens_used": 82718176, |
|
"theoretical_loss": 3.316328184743637, |
|
"tokens_seen": 3052666880 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040355004277159965, |
|
"loss": 2.4723, |
|
"theoretical_loss": 3.316328184743637, |
|
"tokens_seen": 3052666880 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004034431137724551, |
|
"loss": 2.5947, |
|
"theoretical_loss": 3.316322730091657, |
|
"tokens_seen": 3052732416 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040333618477331055, |
|
"loss": 2.6962, |
|
"theoretical_loss": 3.3163172755895634, |
|
"tokens_seen": 3052797952 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040322925577416596, |
|
"loss": 2.6721, |
|
"theoretical_loss": 3.3163118212373486, |
|
"tokens_seen": 3052863488 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040312232677502144, |
|
"loss": 2.8194, |
|
"theoretical_loss": 3.3163063670350055, |
|
"tokens_seen": 3052929024 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004030153977758768, |
|
"loss": 2.4612, |
|
"theoretical_loss": 3.316300912982527, |
|
"tokens_seen": 3052994560 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004029084687767323, |
|
"loss": 2.7791, |
|
"theoretical_loss": 3.3162954590799054, |
|
"tokens_seen": 3053060096 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004028015397775877, |
|
"loss": 2.5944, |
|
"theoretical_loss": 3.316290005327134, |
|
"tokens_seen": 3053125632 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004026946107784431, |
|
"loss": 2.5546, |
|
"theoretical_loss": 3.316284551724204, |
|
"tokens_seen": 3053191168 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040258768177929853, |
|
"loss": 2.6204, |
|
"theoretical_loss": 3.31627909827111, |
|
"tokens_seen": 3053256704 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.000402480752780154, |
|
"loss": 2.5182, |
|
"theoretical_loss": 3.3162736449678434, |
|
"tokens_seen": 3053322240 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004023738237810094, |
|
"loss": 2.6924, |
|
"theoretical_loss": 3.3162681918143972, |
|
"tokens_seen": 3053387776 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040226689478186484, |
|
"loss": 2.6499, |
|
"theoretical_loss": 3.3162627388107637, |
|
"tokens_seen": 3053453312 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004021599657827203, |
|
"loss": 2.6051, |
|
"theoretical_loss": 3.3162572859569366, |
|
"tokens_seen": 3053518848 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004020530367835757, |
|
"loss": 2.7286, |
|
"theoretical_loss": 3.316251833252908, |
|
"tokens_seen": 3053584384 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040194610778443115, |
|
"loss": 2.5385, |
|
"theoretical_loss": 3.3162463806986695, |
|
"tokens_seen": 3053649920 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040183917878528657, |
|
"loss": 2.5242, |
|
"theoretical_loss": 3.3162409282942154, |
|
"tokens_seen": 3053715456 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.00040173224978614204, |
|
"loss": 2.5566, |
|
"theoretical_loss": 3.316235476039538, |
|
"tokens_seen": 3053780992 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 0.0004016253207869974, |
|
"loss": 2.6922, |
|
"theoretical_loss": 3.316230023934629, |
|
"tokens_seen": 3053846528 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0004015183917878529, |
|
"loss": 2.6956, |
|
"theoretical_loss": 3.316224571979482, |
|
"tokens_seen": 3053912064 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00040141146278870835, |
|
"loss": 2.6225, |
|
"theoretical_loss": 3.3162191201740896, |
|
"tokens_seen": 3053977600 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0004013045337895637, |
|
"loss": 2.5757, |
|
"theoretical_loss": 3.3162136685184445, |
|
"tokens_seen": 3054043136 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0004011976047904192, |
|
"loss": 2.5213, |
|
"theoretical_loss": 3.3162082170125387, |
|
"tokens_seen": 3054108672 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0004010906757912746, |
|
"loss": 2.6254, |
|
"theoretical_loss": 3.3162027656563655, |
|
"tokens_seen": 3054174208 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00040098374679213, |
|
"loss": 2.4652, |
|
"theoretical_loss": 3.3161973144499175, |
|
"tokens_seen": 3054239744 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"objective/train/docs_used": 1674056, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.5389561653137207, |
|
"objective/train/theoretical_loss": 3.316191863393187, |
|
"objective/train/tokens_used": 84356576, |
|
"theoretical_loss": 3.316191863393187, |
|
"tokens_seen": 3054305280 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00040087681779298545, |
|
"loss": 2.6531, |
|
"theoretical_loss": 3.316191863393187, |
|
"tokens_seen": 3054305280 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0004007698887938409, |
|
"loss": 2.8476, |
|
"theoretical_loss": 3.3161864124861675, |
|
"tokens_seen": 3054370816 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0004006629597946963, |
|
"loss": 2.4375, |
|
"theoretical_loss": 3.316180961728851, |
|
"tokens_seen": 3054436352 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00040055603079555176, |
|
"loss": 2.806, |
|
"theoretical_loss": 3.31617551112123, |
|
"tokens_seen": 3054501888 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0004004491017964072, |
|
"loss": 2.4958, |
|
"theoretical_loss": 3.316170060663298, |
|
"tokens_seen": 3054567424 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00040034217279726265, |
|
"loss": 2.741, |
|
"theoretical_loss": 3.316164610355047, |
|
"tokens_seen": 3054632960 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00040023524379811807, |
|
"loss": 2.5739, |
|
"theoretical_loss": 3.3161591601964697, |
|
"tokens_seen": 3054698496 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0004001283147989735, |
|
"loss": 2.5663, |
|
"theoretical_loss": 3.316153710187559, |
|
"tokens_seen": 3054764032 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00040002138579982896, |
|
"loss": 2.8328, |
|
"theoretical_loss": 3.3161482603283075, |
|
"tokens_seen": 3054829568 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003999144568006843, |
|
"loss": 2.5696, |
|
"theoretical_loss": 3.316142810618708, |
|
"tokens_seen": 3054895104 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003998075278015398, |
|
"loss": 2.6351, |
|
"theoretical_loss": 3.316137361058753, |
|
"tokens_seen": 3054960640 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003997005988023952, |
|
"loss": 2.6533, |
|
"theoretical_loss": 3.3161319116484353, |
|
"tokens_seen": 3055026176 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039959366980325063, |
|
"loss": 2.9423, |
|
"theoretical_loss": 3.3161264623877473, |
|
"tokens_seen": 3055091712 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039948674080410605, |
|
"loss": 2.6148, |
|
"theoretical_loss": 3.3161210132766823, |
|
"tokens_seen": 3055157248 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003993798118049615, |
|
"loss": 2.6636, |
|
"theoretical_loss": 3.3161155643152327, |
|
"tokens_seen": 3055222784 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.000399272882805817, |
|
"loss": 2.8378, |
|
"theoretical_loss": 3.316110115503391, |
|
"tokens_seen": 3055288320 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039916595380667236, |
|
"loss": 2.5724, |
|
"theoretical_loss": 3.3161046668411496, |
|
"tokens_seen": 3055353856 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039905902480752783, |
|
"loss": 2.7957, |
|
"theoretical_loss": 3.316099218328502, |
|
"tokens_seen": 3055419392 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039895209580838325, |
|
"loss": 2.6997, |
|
"theoretical_loss": 3.3160937699654403, |
|
"tokens_seen": 3055484928 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039884516680923867, |
|
"loss": 2.6501, |
|
"theoretical_loss": 3.3160883217519572, |
|
"tokens_seen": 3055550464 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003987382378100941, |
|
"loss": 2.7211, |
|
"theoretical_loss": 3.316082873688045, |
|
"tokens_seen": 3055616000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039863130881094956, |
|
"loss": 2.6697, |
|
"theoretical_loss": 3.316077425773698, |
|
"tokens_seen": 3055681536 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003985243798118049, |
|
"loss": 2.5528, |
|
"theoretical_loss": 3.316071978008907, |
|
"tokens_seen": 3055747072 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003984174508126604, |
|
"loss": 2.7356, |
|
"theoretical_loss": 3.316066530393666, |
|
"tokens_seen": 3055812608 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039831052181351587, |
|
"loss": 2.5541, |
|
"theoretical_loss": 3.316061082927967, |
|
"tokens_seen": 3055878144 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"objective/train/docs_used": 1674689, |
|
"objective/train/instantaneous_batch_size": 16, |
|
"objective/train/instantaneous_microbatch_size": 16384, |
|
"objective/train/original_loss": 2.368884563446045, |
|
"objective/train/theoretical_loss": 3.3160556356118027, |
|
"objective/train/tokens_used": 85994976, |
|
"theoretical_loss": 3.3160556356118027, |
|
"tokens_seen": 3055943680 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039820359281437124, |
|
"loss": 2.7249, |
|
"theoretical_loss": 3.3160556356118027, |
|
"tokens_seen": 3055943680 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003980966638152267, |
|
"loss": 2.8312, |
|
"theoretical_loss": 3.316050188445166, |
|
"tokens_seen": 3056009216 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039798973481608213, |
|
"loss": 2.7138, |
|
"theoretical_loss": 3.3160447414280494, |
|
"tokens_seen": 3056074752 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003978828058169376, |
|
"loss": 2.6114, |
|
"theoretical_loss": 3.3160392945604458, |
|
"tokens_seen": 3056140288 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039777587681779297, |
|
"loss": 2.7356, |
|
"theoretical_loss": 3.316033847842348, |
|
"tokens_seen": 3056205824 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039766894781864844, |
|
"loss": 2.6253, |
|
"theoretical_loss": 3.316028401273748, |
|
"tokens_seen": 3056271360 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039756201881950386, |
|
"loss": 2.4828, |
|
"theoretical_loss": 3.3160229548546396, |
|
"tokens_seen": 3056336896 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003974550898203593, |
|
"loss": 2.7273, |
|
"theoretical_loss": 3.3160175085850145, |
|
"tokens_seen": 3056402432 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039734816082121475, |
|
"loss": 2.7228, |
|
"theoretical_loss": 3.316012062464866, |
|
"tokens_seen": 3056467968 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.00039724123182207017, |
|
"loss": 2.528, |
|
"theoretical_loss": 3.316006616494186, |
|
"tokens_seen": 3056533504 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003971343028229256, |
|
"loss": 2.7335, |
|
"theoretical_loss": 3.316001170672968, |
|
"tokens_seen": 3056599040 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.000397027373823781, |
|
"loss": 2.6632, |
|
"theoretical_loss": 3.315995725001205, |
|
"tokens_seen": 3056664576 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"learning_rate": 0.0003969204448246365, |
|
"loss": 2.4579, |
|
"theoretical_loss": 3.3159902794788887, |
|
"tokens_seen": 3056730112 |
|
} |
|
], |
|
"max_steps": 4724, |
|
"num_train_epochs": 9223372036854775807, |
|
"total_flos": 3.3846724657152e+16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|