{ "best_metric": 1.4920570850372314, "best_model_checkpoint": "./models_trained/sft_full_df_2/checkpoint-6696", "epoch": 2.9989922741014445, "eval_steps": 2232, "global_step": 6696, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08017019370731161, "grad_norm": 9.433825492858887, "learning_rate": 1.0022396416573349e-06, "loss": 2.5707, "step": 179 }, { "epoch": 0.16034038741462323, "grad_norm": 5.339526176452637, "learning_rate": 2.0044792833146697e-06, "loss": 1.9288, "step": 358 }, { "epoch": 0.24051058112193482, "grad_norm": 6.231087684631348, "learning_rate": 3.0067189249720046e-06, "loss": 1.6422, "step": 537 }, { "epoch": 0.32068077482924645, "grad_norm": 6.053775787353516, "learning_rate": 4.0089585666293395e-06, "loss": 1.6095, "step": 716 }, { "epoch": 0.40085096853655805, "grad_norm": 6.309902667999268, "learning_rate": 4.999999235639976e-06, "loss": 1.5876, "step": 895 }, { "epoch": 0.48102116224386965, "grad_norm": 6.86934757232666, "learning_rate": 4.993742312315323e-06, "loss": 1.5839, "step": 1074 }, { "epoch": 0.5611913559511813, "grad_norm": 5.88812255859375, "learning_rate": 4.975275594860625e-06, "loss": 1.5634, "step": 1253 }, { "epoch": 0.6413615496584929, "grad_norm": 6.350925445556641, "learning_rate": 4.944689499521886e-06, "loss": 1.5406, "step": 1432 }, { "epoch": 0.7215317433658045, "grad_norm": 6.42539119720459, "learning_rate": 4.902133781117591e-06, "loss": 1.5544, "step": 1611 }, { "epoch": 0.8017019370731161, "grad_norm": 6.217959880828857, "learning_rate": 4.847816799813184e-06, "loss": 1.5301, "step": 1790 }, { "epoch": 0.8818721307804277, "grad_norm": 6.820368766784668, "learning_rate": 4.782004500953626e-06, "loss": 1.5179, "step": 1969 }, { "epoch": 0.9620423244877393, "grad_norm": 7.019912242889404, "learning_rate": 4.705019112948941e-06, "loss": 1.5084, "step": 2148 }, { "epoch": 0.9996640913671482, "eval_loss": 1.5233653783798218, "eval_runtime": 72.6117, "eval_samples_per_second": 27.337, "eval_steps_per_second": 13.675, "step": 2232 }, { "epoch": 1.042212518195051, "grad_norm": 6.613691806793213, "learning_rate": 4.617237569588121e-06, "loss": 1.5067, "step": 2327 }, { "epoch": 1.1223827119023626, "grad_norm": 7.442265510559082, "learning_rate": 4.519089664506044e-06, "loss": 1.4983, "step": 2506 }, { "epoch": 1.2025529056096742, "grad_norm": 9.082566261291504, "learning_rate": 4.411055946839413e-06, "loss": 1.4935, "step": 2685 }, { "epoch": 1.2827230993169858, "grad_norm": 7.871016979217529, "learning_rate": 4.293665368374987e-06, "loss": 1.4923, "step": 2864 }, { "epoch": 1.3628932930242974, "grad_norm": 7.989074230194092, "learning_rate": 4.167492693710046e-06, "loss": 1.4794, "step": 3043 }, { "epoch": 1.443063486731609, "grad_norm": 7.531464576721191, "learning_rate": 4.033155686105407e-06, "loss": 1.4967, "step": 3222 }, { "epoch": 1.5232336804389206, "grad_norm": 7.842589378356934, "learning_rate": 3.8913120828095415e-06, "loss": 1.4583, "step": 3401 }, { "epoch": 1.6034038741462322, "grad_norm": 8.500748634338379, "learning_rate": 3.7426563746631257e-06, "loss": 1.4787, "step": 3580 }, { "epoch": 1.6835740678535438, "grad_norm": 9.060755729675293, "learning_rate": 3.587916405751636e-06, "loss": 1.4766, "step": 3759 }, { "epoch": 1.7637442615608554, "grad_norm": 8.827229499816895, "learning_rate": 3.4278498097546904e-06, "loss": 1.472, "step": 3938 }, { "epoch": 1.843914455268167, "grad_norm": 8.073822975158691, "learning_rate": 3.2632403004403746e-06, "loss": 1.4988, "step": 4117 }, { "epoch": 1.9240846489754788, "grad_norm": 9.162446022033691, "learning_rate": 3.0948938344669414e-06, "loss": 1.4583, "step": 4296 }, { "epoch": 1.9993281827342964, "eval_loss": 1.496500849723816, "eval_runtime": 72.6334, "eval_samples_per_second": 27.329, "eval_steps_per_second": 13.671, "step": 4464 }, { "epoch": 2.00425484268279, "grad_norm": 9.242565155029297, "learning_rate": 2.9236346652794664e-06, "loss": 1.4732, "step": 4475 }, { "epoch": 2.084425036390102, "grad_norm": 9.86392593383789, "learning_rate": 2.750301307422268e-06, "loss": 1.4263, "step": 4654 }, { "epoch": 2.1645952300974134, "grad_norm": 10.472150802612305, "learning_rate": 2.575742431026521e-06, "loss": 1.4263, "step": 4833 }, { "epoch": 2.244765423804725, "grad_norm": 11.642035484313965, "learning_rate": 2.40081270657435e-06, "loss": 1.4318, "step": 5012 }, { "epoch": 2.3249356175120366, "grad_norm": 11.944063186645508, "learning_rate": 2.226368620284175e-06, "loss": 1.4432, "step": 5191 }, { "epoch": 2.4051058112193484, "grad_norm": 10.567192077636719, "learning_rate": 2.0532642806058894e-06, "loss": 1.4353, "step": 5370 }, { "epoch": 2.48527600492666, "grad_norm": 13.025165557861328, "learning_rate": 1.8823472363580105e-06, "loss": 1.4327, "step": 5549 }, { "epoch": 2.5654461986339716, "grad_norm": 9.754578590393066, "learning_rate": 1.714454326981919e-06, "loss": 1.4214, "step": 5728 }, { "epoch": 2.6456163923412834, "grad_norm": 9.988022804260254, "learning_rate": 1.5504075852310582e-06, "loss": 1.4299, "step": 5907 }, { "epoch": 2.725786586048595, "grad_norm": 9.87160587310791, "learning_rate": 1.3910102123562535e-06, "loss": 1.4144, "step": 6086 }, { "epoch": 2.805956779755906, "grad_norm": 9.950671195983887, "learning_rate": 1.2370426454933122e-06, "loss": 1.4077, "step": 6265 }, { "epoch": 2.886126973463218, "grad_norm": 11.326949119567871, "learning_rate": 1.0892587365076916e-06, "loss": 1.4311, "step": 6444 }, { "epoch": 2.96629716717053, "grad_norm": 11.63015365600586, "learning_rate": 9.483820610052311e-07, "loss": 1.4271, "step": 6623 }, { "epoch": 2.9989922741014445, "eval_loss": 1.4920570850372314, "eval_runtime": 72.6497, "eval_samples_per_second": 27.323, "eval_steps_per_second": 13.668, "step": 6696 } ], "logging_steps": 179, "max_steps": 8928, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 2232, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.021394676596736e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }