{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9328575759695888, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 0.3762490749359131, "learning_rate": 0.00046641791044776124, "loss": 8.4353, "step": 500 }, { "epoch": 0.02, "eval_loss": 6.534626483917236, "eval_runtime": 213.0459, "eval_samples_per_second": 264.877, "eval_steps_per_second": 8.28, "step": 500 }, { "epoch": 0.05, "grad_norm": 1.518302321434021, "learning_rate": 0.0009328358208955225, "loss": 5.5553, "step": 1000 }, { "epoch": 0.05, "eval_loss": 4.976442813873291, "eval_runtime": 215.6331, "eval_samples_per_second": 261.699, "eval_steps_per_second": 8.181, "step": 1000 }, { "epoch": 0.07, "grad_norm": 1.8457584381103516, "learning_rate": 0.0009789877154220063, "loss": 4.5996, "step": 1500 }, { "epoch": 0.07, "eval_loss": 4.408080101013184, "eval_runtime": 215.4944, "eval_samples_per_second": 261.868, "eval_steps_per_second": 8.186, "step": 1500 }, { "epoch": 0.09, "grad_norm": 1.3413056135177612, "learning_rate": 0.0009544406539991162, "loss": 4.1961, "step": 2000 }, { "epoch": 0.09, "eval_loss": 4.121068954467773, "eval_runtime": 216.0979, "eval_samples_per_second": 261.136, "eval_steps_per_second": 8.163, "step": 2000 }, { "epoch": 0.12, "grad_norm": 1.2387056350708008, "learning_rate": 0.0009298935925762261, "loss": 3.9584, "step": 2500 }, { "epoch": 0.12, "eval_loss": 3.929608106613159, "eval_runtime": 216.438, "eval_samples_per_second": 260.726, "eval_steps_per_second": 8.15, "step": 2500 }, { "epoch": 0.14, "grad_norm": 1.3727014064788818, "learning_rate": 0.0009053465311533362, "loss": 3.7833, "step": 3000 }, { "epoch": 0.14, "eval_loss": 3.787693738937378, "eval_runtime": 218.2815, "eval_samples_per_second": 258.524, "eval_steps_per_second": 8.081, "step": 3000 }, { "epoch": 0.16, "grad_norm": 1.5133860111236572, "learning_rate": 0.0008807994697304462, "loss": 3.6616, "step": 3500 }, { "epoch": 0.16, "eval_loss": 3.683837652206421, "eval_runtime": 217.4429, "eval_samples_per_second": 259.521, "eval_steps_per_second": 8.112, "step": 3500 }, { "epoch": 0.19, "grad_norm": 1.3134872913360596, "learning_rate": 0.0008562524083075562, "loss": 3.5679, "step": 4000 }, { "epoch": 0.19, "eval_loss": 3.6060750484466553, "eval_runtime": 216.1087, "eval_samples_per_second": 261.123, "eval_steps_per_second": 8.163, "step": 4000 }, { "epoch": 0.21, "grad_norm": 1.3464657068252563, "learning_rate": 0.0008317053468846664, "loss": 3.4964, "step": 4500 }, { "epoch": 0.21, "eval_loss": 3.54757022857666, "eval_runtime": 216.7125, "eval_samples_per_second": 260.396, "eval_steps_per_second": 8.14, "step": 4500 }, { "epoch": 0.23, "grad_norm": 1.3200907707214355, "learning_rate": 0.0008071582854617764, "loss": 3.4488, "step": 5000 }, { "epoch": 0.23, "eval_loss": 3.4986045360565186, "eval_runtime": 217.9581, "eval_samples_per_second": 258.908, "eval_steps_per_second": 8.093, "step": 5000 }, { "epoch": 0.26, "grad_norm": 1.514103651046753, "learning_rate": 0.0007826112240388863, "loss": 3.3973, "step": 5500 }, { "epoch": 0.26, "eval_loss": 3.4664642810821533, "eval_runtime": 218.2073, "eval_samples_per_second": 258.612, "eval_steps_per_second": 8.084, "step": 5500 }, { "epoch": 0.28, "grad_norm": 1.4389874935150146, "learning_rate": 0.0007580641626159963, "loss": 3.3587, "step": 6000 }, { "epoch": 0.28, "eval_loss": 3.4231605529785156, "eval_runtime": 218.4087, "eval_samples_per_second": 258.373, "eval_steps_per_second": 8.077, "step": 6000 }, { "epoch": 0.3, "grad_norm": 1.4937726259231567, "learning_rate": 0.0007335171011931065, "loss": 3.3231, "step": 6500 }, { "epoch": 0.3, "eval_loss": 3.390798330307007, "eval_runtime": 216.9327, "eval_samples_per_second": 260.131, "eval_steps_per_second": 8.132, "step": 6500 }, { "epoch": 0.33, "grad_norm": 1.5942374467849731, "learning_rate": 0.0007089700397702165, "loss": 3.2971, "step": 7000 }, { "epoch": 0.33, "eval_loss": 3.361341714859009, "eval_runtime": 217.461, "eval_samples_per_second": 259.499, "eval_steps_per_second": 8.112, "step": 7000 }, { "epoch": 0.35, "grad_norm": 1.6310980319976807, "learning_rate": 0.0006844229783473265, "loss": 3.2679, "step": 7500 }, { "epoch": 0.35, "eval_loss": 3.33766770362854, "eval_runtime": 217.3952, "eval_samples_per_second": 259.578, "eval_steps_per_second": 8.114, "step": 7500 }, { "epoch": 0.37, "grad_norm": 1.6001648902893066, "learning_rate": 0.0006598759169244364, "loss": 3.2436, "step": 8000 }, { "epoch": 0.37, "eval_loss": 3.3167307376861572, "eval_runtime": 217.5617, "eval_samples_per_second": 259.379, "eval_steps_per_second": 8.108, "step": 8000 }, { "epoch": 0.4, "grad_norm": 1.590154767036438, "learning_rate": 0.0006353288555015467, "loss": 3.2322, "step": 8500 }, { "epoch": 0.4, "eval_loss": 3.2952842712402344, "eval_runtime": 217.0219, "eval_samples_per_second": 260.024, "eval_steps_per_second": 8.128, "step": 8500 }, { "epoch": 0.42, "grad_norm": 1.7063907384872437, "learning_rate": 0.0006107817940786566, "loss": 3.208, "step": 9000 }, { "epoch": 0.42, "eval_loss": 3.2771995067596436, "eval_runtime": 218.9112, "eval_samples_per_second": 257.78, "eval_steps_per_second": 8.058, "step": 9000 }, { "epoch": 0.44, "grad_norm": 1.544815182685852, "learning_rate": 0.0005862347326557666, "loss": 3.1923, "step": 9500 }, { "epoch": 0.44, "eval_loss": 3.263777017593384, "eval_runtime": 218.0114, "eval_samples_per_second": 258.844, "eval_steps_per_second": 8.091, "step": 9500 }, { "epoch": 0.47, "grad_norm": 1.615902304649353, "learning_rate": 0.0005616876712328766, "loss": 3.1683, "step": 10000 }, { "epoch": 0.47, "eval_loss": 3.2488255500793457, "eval_runtime": 217.5444, "eval_samples_per_second": 259.4, "eval_steps_per_second": 8.109, "step": 10000 }, { "epoch": 0.49, "grad_norm": 1.533378005027771, "learning_rate": 0.0005371897039328324, "loss": 3.158, "step": 10500 }, { "epoch": 0.49, "eval_loss": 3.2329940795898438, "eval_runtime": 214.5885, "eval_samples_per_second": 262.973, "eval_steps_per_second": 8.22, "step": 10500 }, { "epoch": 0.51, "grad_norm": 1.9465535879135132, "learning_rate": 0.0005126426425099425, "loss": 3.1439, "step": 11000 }, { "epoch": 0.51, "eval_loss": 3.2245969772338867, "eval_runtime": 216.8559, "eval_samples_per_second": 260.224, "eval_steps_per_second": 8.134, "step": 11000 }, { "epoch": 0.54, "grad_norm": 1.7405271530151367, "learning_rate": 0.0004880955810870526, "loss": 3.1374, "step": 11500 }, { "epoch": 0.54, "eval_loss": 3.208115339279175, "eval_runtime": 217.1863, "eval_samples_per_second": 259.828, "eval_steps_per_second": 8.122, "step": 11500 }, { "epoch": 0.56, "grad_norm": 1.6891497373580933, "learning_rate": 0.0004635485196641626, "loss": 3.1213, "step": 12000 }, { "epoch": 0.56, "eval_loss": 3.200620651245117, "eval_runtime": 218.4169, "eval_samples_per_second": 258.364, "eval_steps_per_second": 8.076, "step": 12000 }, { "epoch": 0.58, "grad_norm": 1.73305082321167, "learning_rate": 0.00043909964648696415, "loss": 3.1144, "step": 12500 }, { "epoch": 0.58, "eval_loss": 3.188239812850952, "eval_runtime": 215.556, "eval_samples_per_second": 261.793, "eval_steps_per_second": 8.183, "step": 12500 }, { "epoch": 0.61, "grad_norm": 1.7891405820846558, "learning_rate": 0.0004145525850640742, "loss": 3.0963, "step": 13000 }, { "epoch": 0.61, "eval_loss": 3.174497365951538, "eval_runtime": 214.2396, "eval_samples_per_second": 263.401, "eval_steps_per_second": 8.234, "step": 13000 }, { "epoch": 0.63, "grad_norm": 1.6677337884902954, "learning_rate": 0.00039000552364118425, "loss": 3.097, "step": 13500 }, { "epoch": 0.63, "eval_loss": 3.165832042694092, "eval_runtime": 214.1769, "eval_samples_per_second": 263.479, "eval_steps_per_second": 8.236, "step": 13500 }, { "epoch": 0.65, "grad_norm": 1.7241294384002686, "learning_rate": 0.00036545846221829433, "loss": 3.076, "step": 14000 }, { "epoch": 0.65, "eval_loss": 3.1584606170654297, "eval_runtime": 264.545, "eval_samples_per_second": 213.313, "eval_steps_per_second": 6.668, "step": 14000 }, { "epoch": 0.68, "grad_norm": 1.835271954536438, "learning_rate": 0.00034091140079540435, "loss": 3.0666, "step": 14500 }, { "epoch": 0.68, "eval_loss": 3.151036262512207, "eval_runtime": 218.4608, "eval_samples_per_second": 258.312, "eval_steps_per_second": 8.075, "step": 14500 }, { "epoch": 0.7, "grad_norm": 1.7558091878890991, "learning_rate": 0.00031636433937251427, "loss": 3.0709, "step": 15000 }, { "epoch": 0.7, "eval_loss": 3.139883279800415, "eval_runtime": 11111.1382, "eval_samples_per_second": 5.079, "eval_steps_per_second": 0.159, "step": 15000 }, { "epoch": 0.72, "grad_norm": 1.6270012855529785, "learning_rate": 0.00029186637207247017, "loss": 3.0609, "step": 15500 }, { "epoch": 0.72, "eval_loss": 3.1347908973693848, "eval_runtime": 258.6315, "eval_samples_per_second": 218.191, "eval_steps_per_second": 6.821, "step": 15500 }, { "epoch": 0.75, "grad_norm": 1.971229910850525, "learning_rate": 0.0002673193106495802, "loss": 3.0527, "step": 16000 }, { "epoch": 0.75, "eval_loss": 3.127486228942871, "eval_runtime": 243.3689, "eval_samples_per_second": 231.874, "eval_steps_per_second": 7.248, "step": 16000 }, { "epoch": 0.77, "grad_norm": 1.7010846138000488, "learning_rate": 0.00024277224922669027, "loss": 3.0449, "step": 16500 }, { "epoch": 0.77, "eval_loss": 3.122706890106201, "eval_runtime": 259.8928, "eval_samples_per_second": 217.132, "eval_steps_per_second": 6.787, "step": 16500 }, { "epoch": 0.79, "grad_norm": 2.0105388164520264, "learning_rate": 0.00021822518780380032, "loss": 3.0407, "step": 17000 }, { "epoch": 0.79, "eval_loss": 3.1165504455566406, "eval_runtime": 264.3004, "eval_samples_per_second": 213.511, "eval_steps_per_second": 6.674, "step": 17000 }, { "epoch": 0.82, "grad_norm": 1.7834789752960205, "learning_rate": 0.00019372722050375605, "loss": 3.0376, "step": 17500 }, { "epoch": 0.82, "eval_loss": 3.10659122467041, "eval_runtime": 265.7435, "eval_samples_per_second": 212.351, "eval_steps_per_second": 6.638, "step": 17500 }, { "epoch": 0.84, "grad_norm": 1.8829165697097778, "learning_rate": 0.0001691801590808661, "loss": 3.0251, "step": 18000 }, { "epoch": 0.84, "eval_loss": 3.1006741523742676, "eval_runtime": 267.8646, "eval_samples_per_second": 210.67, "eval_steps_per_second": 6.585, "step": 18000 }, { "epoch": 0.86, "grad_norm": 1.7196826934814453, "learning_rate": 0.00014463309765797616, "loss": 3.0226, "step": 18500 }, { "epoch": 0.86, "eval_loss": 3.0954883098602295, "eval_runtime": 263.6733, "eval_samples_per_second": 214.019, "eval_steps_per_second": 6.69, "step": 18500 }, { "epoch": 0.89, "grad_norm": 1.7544931173324585, "learning_rate": 0.00012008603623508621, "loss": 3.0203, "step": 19000 }, { "epoch": 0.89, "eval_loss": 3.0885541439056396, "eval_runtime": 225.0692, "eval_samples_per_second": 250.727, "eval_steps_per_second": 7.838, "step": 19000 }, { "epoch": 0.91, "grad_norm": 1.659536361694336, "learning_rate": 9.558806893504194e-05, "loss": 3.0151, "step": 19500 }, { "epoch": 0.91, "eval_loss": 3.083906888961792, "eval_runtime": 257.4684, "eval_samples_per_second": 219.176, "eval_steps_per_second": 6.851, "step": 19500 }, { "epoch": 0.93, "grad_norm": 1.6146643161773682, "learning_rate": 7.1041007512152e-05, "loss": 3.0018, "step": 20000 }, { "epoch": 0.93, "eval_loss": 3.0793216228485107, "eval_runtime": 231.3706, "eval_samples_per_second": 243.899, "eval_steps_per_second": 7.624, "step": 20000 } ], "logging_steps": 500, "max_steps": 21439, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "total_flos": 196340613120000.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }