{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 50, "global_step": 1908, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07861635220125786, "grad_norm": 0.30994901061058044, "learning_rate": 0.00013089005235602096, "loss": 4.0802, "step": 50 }, { "epoch": 0.07861635220125786, "eval_loss": 3.3922524452209473, "eval_runtime": 27.5047, "eval_samples_per_second": 3.636, "eval_steps_per_second": 0.473, "step": 50 }, { "epoch": 0.15723270440251572, "grad_norm": 0.1917889267206192, "learning_rate": 0.0002617801047120419, "loss": 2.6833, "step": 100 }, { "epoch": 0.15723270440251572, "eval_loss": 2.327249050140381, "eval_runtime": 27.3498, "eval_samples_per_second": 3.656, "eval_steps_per_second": 0.475, "step": 100 }, { "epoch": 0.2358490566037736, "grad_norm": 0.1460731029510498, "learning_rate": 0.00039267015706806284, "loss": 2.1534, "step": 150 }, { "epoch": 0.2358490566037736, "eval_loss": 2.1624627113342285, "eval_runtime": 27.2554, "eval_samples_per_second": 3.669, "eval_steps_per_second": 0.477, "step": 150 }, { "epoch": 0.31446540880503143, "grad_norm": 0.15231014788150787, "learning_rate": 0.0004973791496796739, "loss": 2.0476, "step": 200 }, { "epoch": 0.31446540880503143, "eval_loss": 2.100531816482544, "eval_runtime": 27.3815, "eval_samples_per_second": 3.652, "eval_steps_per_second": 0.475, "step": 200 }, { "epoch": 0.39308176100628933, "grad_norm": 0.1586989462375641, "learning_rate": 0.00048281887012230633, "loss": 1.9929, "step": 250 }, { "epoch": 0.39308176100628933, "eval_loss": 2.0717108249664307, "eval_runtime": 27.458, "eval_samples_per_second": 3.642, "eval_steps_per_second": 0.473, "step": 250 }, { "epoch": 0.4716981132075472, "grad_norm": 0.14461246132850647, "learning_rate": 0.00046825859056493884, "loss": 1.974, "step": 300 }, { "epoch": 0.4716981132075472, "eval_loss": 2.0508835315704346, "eval_runtime": 27.4544, "eval_samples_per_second": 3.642, "eval_steps_per_second": 0.474, "step": 300 }, { "epoch": 0.550314465408805, "grad_norm": 0.22759944200515747, "learning_rate": 0.00045369831100757136, "loss": 1.9854, "step": 350 }, { "epoch": 0.550314465408805, "eval_loss": 2.043459892272949, "eval_runtime": 27.4852, "eval_samples_per_second": 3.638, "eval_steps_per_second": 0.473, "step": 350 }, { "epoch": 0.6289308176100629, "grad_norm": 0.1476861983537674, "learning_rate": 0.0004391380314502039, "loss": 2.0109, "step": 400 }, { "epoch": 0.6289308176100629, "eval_loss": 2.035849094390869, "eval_runtime": 27.3716, "eval_samples_per_second": 3.653, "eval_steps_per_second": 0.475, "step": 400 }, { "epoch": 0.7075471698113207, "grad_norm": 0.16703520715236664, "learning_rate": 0.0004245777518928364, "loss": 1.9939, "step": 450 }, { "epoch": 0.7075471698113207, "eval_loss": 2.0285253524780273, "eval_runtime": 27.4183, "eval_samples_per_second": 3.647, "eval_steps_per_second": 0.474, "step": 450 }, { "epoch": 0.7861635220125787, "grad_norm": 0.14345824718475342, "learning_rate": 0.00041001747233546885, "loss": 1.9672, "step": 500 }, { "epoch": 0.7861635220125787, "eval_loss": 2.0150387287139893, "eval_runtime": 27.4063, "eval_samples_per_second": 3.649, "eval_steps_per_second": 0.474, "step": 500 }, { "epoch": 0.8647798742138365, "grad_norm": 0.15760189294815063, "learning_rate": 0.0003954571927781013, "loss": 1.9565, "step": 550 }, { "epoch": 0.8647798742138365, "eval_loss": 2.011507034301758, "eval_runtime": 27.3979, "eval_samples_per_second": 3.65, "eval_steps_per_second": 0.474, "step": 550 }, { "epoch": 0.9433962264150944, "grad_norm": 0.19006682932376862, "learning_rate": 0.0003808969132207338, "loss": 1.9706, "step": 600 }, { "epoch": 0.9433962264150944, "eval_loss": 2.006178379058838, "eval_runtime": 27.3191, "eval_samples_per_second": 3.66, "eval_steps_per_second": 0.476, "step": 600 }, { "epoch": 1.0220125786163523, "grad_norm": 0.13335242867469788, "learning_rate": 0.00036633663366336634, "loss": 1.9352, "step": 650 }, { "epoch": 1.0220125786163523, "eval_loss": 1.9949337244033813, "eval_runtime": 27.384, "eval_samples_per_second": 3.652, "eval_steps_per_second": 0.475, "step": 650 }, { "epoch": 1.10062893081761, "grad_norm": 0.1745985597372055, "learning_rate": 0.00035177635410599885, "loss": 1.8647, "step": 700 }, { "epoch": 1.10062893081761, "eval_loss": 1.9930100440979004, "eval_runtime": 27.4302, "eval_samples_per_second": 3.646, "eval_steps_per_second": 0.474, "step": 700 }, { "epoch": 1.179245283018868, "grad_norm": 0.17578236758708954, "learning_rate": 0.00033721607454863137, "loss": 1.9109, "step": 750 }, { "epoch": 1.179245283018868, "eval_loss": 1.9840075969696045, "eval_runtime": 27.4312, "eval_samples_per_second": 3.645, "eval_steps_per_second": 0.474, "step": 750 }, { "epoch": 1.2578616352201257, "grad_norm": 0.16206932067871094, "learning_rate": 0.0003226557949912638, "loss": 1.885, "step": 800 }, { "epoch": 1.2578616352201257, "eval_loss": 1.9805808067321777, "eval_runtime": 27.4061, "eval_samples_per_second": 3.649, "eval_steps_per_second": 0.474, "step": 800 }, { "epoch": 1.3364779874213837, "grad_norm": 0.13977627456188202, "learning_rate": 0.00030809551543389634, "loss": 1.8864, "step": 850 }, { "epoch": 1.3364779874213837, "eval_loss": 1.9878467321395874, "eval_runtime": 27.3272, "eval_samples_per_second": 3.659, "eval_steps_per_second": 0.476, "step": 850 }, { "epoch": 1.4150943396226414, "grad_norm": 0.19199030101299286, "learning_rate": 0.00029353523587652885, "loss": 1.8931, "step": 900 }, { "epoch": 1.4150943396226414, "eval_loss": 1.9824912548065186, "eval_runtime": 27.2809, "eval_samples_per_second": 3.666, "eval_steps_per_second": 0.477, "step": 900 }, { "epoch": 1.4937106918238994, "grad_norm": 0.17020899057388306, "learning_rate": 0.0002789749563191613, "loss": 1.8599, "step": 950 }, { "epoch": 1.4937106918238994, "eval_loss": 1.975517988204956, "eval_runtime": 27.4147, "eval_samples_per_second": 3.648, "eval_steps_per_second": 0.474, "step": 950 }, { "epoch": 1.5723270440251573, "grad_norm": 0.2071741223335266, "learning_rate": 0.00026441467676179383, "loss": 1.9019, "step": 1000 }, { "epoch": 1.5723270440251573, "eval_loss": 1.9695261716842651, "eval_runtime": 27.3628, "eval_samples_per_second": 3.655, "eval_steps_per_second": 0.475, "step": 1000 }, { "epoch": 1.650943396226415, "grad_norm": 0.18793335556983948, "learning_rate": 0.00024985439720442634, "loss": 1.9081, "step": 1050 }, { "epoch": 1.650943396226415, "eval_loss": 1.959928274154663, "eval_runtime": 27.4577, "eval_samples_per_second": 3.642, "eval_steps_per_second": 0.473, "step": 1050 }, { "epoch": 1.7295597484276728, "grad_norm": 0.2016116827726364, "learning_rate": 0.00023529411764705883, "loss": 1.8736, "step": 1100 }, { "epoch": 1.7295597484276728, "eval_loss": 1.9582524299621582, "eval_runtime": 27.3035, "eval_samples_per_second": 3.663, "eval_steps_per_second": 0.476, "step": 1100 }, { "epoch": 1.808176100628931, "grad_norm": 0.1896527260541916, "learning_rate": 0.00022073383808969132, "loss": 1.8939, "step": 1150 }, { "epoch": 1.808176100628931, "eval_loss": 1.9566823244094849, "eval_runtime": 27.4057, "eval_samples_per_second": 3.649, "eval_steps_per_second": 0.474, "step": 1150 }, { "epoch": 1.8867924528301887, "grad_norm": 0.17931561172008514, "learning_rate": 0.00020617355853232383, "loss": 1.8867, "step": 1200 }, { "epoch": 1.8867924528301887, "eval_loss": 1.9534107446670532, "eval_runtime": 27.4354, "eval_samples_per_second": 3.645, "eval_steps_per_second": 0.474, "step": 1200 }, { "epoch": 1.9654088050314464, "grad_norm": 0.21005648374557495, "learning_rate": 0.00019161327897495632, "loss": 1.8875, "step": 1250 }, { "epoch": 1.9654088050314464, "eval_loss": 1.947943925857544, "eval_runtime": 27.4109, "eval_samples_per_second": 3.648, "eval_steps_per_second": 0.474, "step": 1250 }, { "epoch": 2.0440251572327046, "grad_norm": 0.175890251994133, "learning_rate": 0.0001770529994175888, "loss": 1.8677, "step": 1300 }, { "epoch": 2.0440251572327046, "eval_loss": 1.949751853942871, "eval_runtime": 27.3631, "eval_samples_per_second": 3.655, "eval_steps_per_second": 0.475, "step": 1300 }, { "epoch": 2.1226415094339623, "grad_norm": 0.23929914832115173, "learning_rate": 0.00016249271986022132, "loss": 1.8152, "step": 1350 }, { "epoch": 2.1226415094339623, "eval_loss": 1.9526574611663818, "eval_runtime": 27.3536, "eval_samples_per_second": 3.656, "eval_steps_per_second": 0.475, "step": 1350 }, { "epoch": 2.20125786163522, "grad_norm": 0.25344499945640564, "learning_rate": 0.00014793244030285383, "loss": 1.8573, "step": 1400 }, { "epoch": 2.20125786163522, "eval_loss": 1.9523253440856934, "eval_runtime": 27.4028, "eval_samples_per_second": 3.649, "eval_steps_per_second": 0.474, "step": 1400 }, { "epoch": 2.279874213836478, "grad_norm": 0.23211850225925446, "learning_rate": 0.0001333721607454863, "loss": 1.8433, "step": 1450 }, { "epoch": 2.279874213836478, "eval_loss": 1.9440948963165283, "eval_runtime": 27.3179, "eval_samples_per_second": 3.661, "eval_steps_per_second": 0.476, "step": 1450 }, { "epoch": 2.358490566037736, "grad_norm": 0.23775285482406616, "learning_rate": 0.00011881188118811881, "loss": 1.828, "step": 1500 }, { "epoch": 2.358490566037736, "eval_loss": 1.945542335510254, "eval_runtime": 27.3283, "eval_samples_per_second": 3.659, "eval_steps_per_second": 0.476, "step": 1500 }, { "epoch": 2.4371069182389937, "grad_norm": 0.20624059438705444, "learning_rate": 0.00010425160163075131, "loss": 1.8298, "step": 1550 }, { "epoch": 2.4371069182389937, "eval_loss": 1.9422754049301147, "eval_runtime": 27.398, "eval_samples_per_second": 3.65, "eval_steps_per_second": 0.474, "step": 1550 }, { "epoch": 2.5157232704402515, "grad_norm": 0.20326970517635345, "learning_rate": 8.969132207338381e-05, "loss": 1.8258, "step": 1600 }, { "epoch": 2.5157232704402515, "eval_loss": 1.9376040697097778, "eval_runtime": 27.3017, "eval_samples_per_second": 3.663, "eval_steps_per_second": 0.476, "step": 1600 }, { "epoch": 2.5943396226415096, "grad_norm": 0.19999054074287415, "learning_rate": 7.513104251601631e-05, "loss": 1.8314, "step": 1650 }, { "epoch": 2.5943396226415096, "eval_loss": 1.937279462814331, "eval_runtime": 27.3919, "eval_samples_per_second": 3.651, "eval_steps_per_second": 0.475, "step": 1650 }, { "epoch": 2.6729559748427674, "grad_norm": 0.17829832434654236, "learning_rate": 6.0570762958648805e-05, "loss": 1.8436, "step": 1700 }, { "epoch": 2.6729559748427674, "eval_loss": 1.93943452835083, "eval_runtime": 27.4007, "eval_samples_per_second": 3.65, "eval_steps_per_second": 0.474, "step": 1700 }, { "epoch": 2.751572327044025, "grad_norm": 0.18850775063037872, "learning_rate": 4.6010483401281306e-05, "loss": 1.8253, "step": 1750 }, { "epoch": 2.751572327044025, "eval_loss": 1.937272310256958, "eval_runtime": 27.2797, "eval_samples_per_second": 3.666, "eval_steps_per_second": 0.477, "step": 1750 }, { "epoch": 2.830188679245283, "grad_norm": 0.19603122770786285, "learning_rate": 3.145020384391381e-05, "loss": 1.8055, "step": 1800 }, { "epoch": 2.830188679245283, "eval_loss": 1.9364655017852783, "eval_runtime": 27.2904, "eval_samples_per_second": 3.664, "eval_steps_per_second": 0.476, "step": 1800 }, { "epoch": 2.908805031446541, "grad_norm": 0.22059138119220734, "learning_rate": 1.68899242865463e-05, "loss": 1.8207, "step": 1850 }, { "epoch": 2.908805031446541, "eval_loss": 1.9341883659362793, "eval_runtime": 27.2939, "eval_samples_per_second": 3.664, "eval_steps_per_second": 0.476, "step": 1850 }, { "epoch": 2.9874213836477987, "grad_norm": 0.20625567436218262, "learning_rate": 2.3296447291788002e-06, "loss": 1.8514, "step": 1900 }, { "epoch": 2.9874213836477987, "eval_loss": 1.9331157207489014, "eval_runtime": 27.3592, "eval_samples_per_second": 3.655, "eval_steps_per_second": 0.475, "step": 1900 } ], "logging_steps": 50, "max_steps": 1908, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.377867800895488e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }