{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.795063575168287, "eval_steps": 100, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05983545250560957, "grad_norm": 7.40893212326487, "learning_rate": 1.96078431372549e-06, "loss": 0.6407, "step": 10 }, { "epoch": 0.11967090501121914, "grad_norm": 2.443879636951024, "learning_rate": 3.92156862745098e-06, "loss": 0.3887, "step": 20 }, { "epoch": 0.17950635751682872, "grad_norm": 2.182405695242156, "learning_rate": 5.882352941176471e-06, "loss": 0.2616, "step": 30 }, { "epoch": 0.2393418100224383, "grad_norm": 2.01489573168189, "learning_rate": 7.84313725490196e-06, "loss": 0.2571, "step": 40 }, { "epoch": 0.2991772625280479, "grad_norm": 1.979013183070111, "learning_rate": 9.803921568627451e-06, "loss": 0.2477, "step": 50 }, { "epoch": 0.35901271503365745, "grad_norm": 1.549355728093093, "learning_rate": 9.990133642141359e-06, "loss": 0.2176, "step": 60 }, { "epoch": 0.418848167539267, "grad_norm": 1.4426481204872057, "learning_rate": 9.95607770125771e-06, "loss": 0.2273, "step": 70 }, { "epoch": 0.4786836200448766, "grad_norm": 1.7789832633152836, "learning_rate": 9.89787624799672e-06, "loss": 0.2125, "step": 80 }, { "epoch": 0.5385190725504861, "grad_norm": 1.529318217095282, "learning_rate": 9.815812833988292e-06, "loss": 0.2229, "step": 90 }, { "epoch": 0.5983545250560958, "grad_norm": 1.6259265112226373, "learning_rate": 9.710287263936485e-06, "loss": 0.2062, "step": 100 }, { "epoch": 0.5983545250560958, "eval_loss": 0.2123425155878067, "eval_runtime": 33.4729, "eval_samples_per_second": 17.776, "eval_steps_per_second": 8.903, "step": 100 }, { "epoch": 0.6581899775617053, "grad_norm": 1.6245243576341002, "learning_rate": 9.581813647811199e-06, "loss": 0.2105, "step": 110 }, { "epoch": 0.7180254300673149, "grad_norm": 1.731561075586601, "learning_rate": 9.431017896156074e-06, "loss": 0.2048, "step": 120 }, { "epoch": 0.7778608825729244, "grad_norm": 1.7874480467541498, "learning_rate": 9.25863467071524e-06, "loss": 0.2113, "step": 130 }, { "epoch": 0.837696335078534, "grad_norm": 1.3708463663991368, "learning_rate": 9.065503805235139e-06, "loss": 0.1988, "step": 140 }, { "epoch": 0.8975317875841436, "grad_norm": 1.3567660521800535, "learning_rate": 8.852566213878947e-06, "loss": 0.2038, "step": 150 }, { "epoch": 0.9573672400897532, "grad_norm": 1.8281708498422444, "learning_rate": 8.620859307187339e-06, "loss": 0.2196, "step": 160 }, { "epoch": 1.0172026925953628, "grad_norm": 1.2318054900550177, "learning_rate": 8.371511937918616e-06, "loss": 0.1762, "step": 170 }, { "epoch": 1.0770381451009723, "grad_norm": 1.568321912319435, "learning_rate": 8.105738901391553e-06, "loss": 0.1288, "step": 180 }, { "epoch": 1.136873597606582, "grad_norm": 1.3819346363939895, "learning_rate": 7.82483501712469e-06, "loss": 0.1214, "step": 190 }, { "epoch": 1.1967090501121915, "grad_norm": 1.2680685647450163, "learning_rate": 7.530168820605819e-06, "loss": 0.1256, "step": 200 }, { "epoch": 1.1967090501121915, "eval_loss": 0.20358169078826904, "eval_runtime": 32.7594, "eval_samples_per_second": 18.163, "eval_steps_per_second": 9.097, "step": 200 }, { "epoch": 1.256544502617801, "grad_norm": 1.2942802177914767, "learning_rate": 7.223175895924638e-06, "loss": 0.1241, "step": 210 }, { "epoch": 1.3163799551234106, "grad_norm": 1.4364370392498633, "learning_rate": 6.905351881751372e-06, "loss": 0.1254, "step": 220 }, { "epoch": 1.37621540762902, "grad_norm": 1.330811194933078, "learning_rate": 6.578245184735513e-06, "loss": 0.1229, "step": 230 }, { "epoch": 1.4360508601346298, "grad_norm": 1.304831888309303, "learning_rate": 6.243449435824276e-06, "loss": 0.1147, "step": 240 }, { "epoch": 1.4958863126402393, "grad_norm": 1.2398683599838292, "learning_rate": 5.902595726252801e-06, "loss": 0.1345, "step": 250 }, { "epoch": 1.555721765145849, "grad_norm": 1.3240317320353998, "learning_rate": 5.557344661031628e-06, "loss": 0.1236, "step": 260 }, { "epoch": 1.6155572176514585, "grad_norm": 1.518581095835922, "learning_rate": 5.209378268645998e-06, "loss": 0.1218, "step": 270 }, { "epoch": 1.675392670157068, "grad_norm": 1.5653129689570715, "learning_rate": 4.860391806382157e-06, "loss": 0.1246, "step": 280 }, { "epoch": 1.7352281226626776, "grad_norm": 1.4836280079781416, "learning_rate": 4.512085501204254e-06, "loss": 0.1156, "step": 290 }, { "epoch": 1.795063575168287, "grad_norm": 1.4998045733125407, "learning_rate": 4.166156266419489e-06, "loss": 0.1296, "step": 300 }, { "epoch": 1.795063575168287, "eval_loss": 0.19370371103286743, "eval_runtime": 33.117, "eval_samples_per_second": 17.967, "eval_steps_per_second": 8.998, "step": 300 } ], "logging_steps": 10, "max_steps": 501, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5914416119808.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }