gemma-7b-sft-full-longest-1k-v0 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.0,
"eval_steps": 500,
"global_step": 90,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.17,
"grad_norm": 1600.5740838054542,
"learning_rate": 1.9993908270190957e-06,
"loss": 10.0226,
"step": 1
},
{
"epoch": 0.83,
"grad_norm": 163.46605797672981,
"learning_rate": 1.984807753012208e-06,
"loss": 5.6993,
"step": 5
},
{
"epoch": 1.0,
"eval_loss": 2.8191075325012207,
"eval_runtime": 49.0667,
"eval_samples_per_second": 20.095,
"eval_steps_per_second": 0.326,
"step": 6
},
{
"epoch": 1.67,
"grad_norm": 50.67962007165623,
"learning_rate": 1.9396926207859082e-06,
"loss": 3.3379,
"step": 10
},
{
"epoch": 2.0,
"eval_loss": 2.2502517700195312,
"eval_runtime": 43.7351,
"eval_samples_per_second": 22.545,
"eval_steps_per_second": 0.366,
"step": 12
},
{
"epoch": 2.5,
"grad_norm": 45.950994283173856,
"learning_rate": 1.8660254037844386e-06,
"loss": 2.8978,
"step": 15
},
{
"epoch": 3.0,
"eval_loss": 2.073012590408325,
"eval_runtime": 45.7645,
"eval_samples_per_second": 21.545,
"eval_steps_per_second": 0.35,
"step": 18
},
{
"epoch": 3.33,
"grad_norm": 24.025617244806874,
"learning_rate": 1.766044443118978e-06,
"loss": 2.7495,
"step": 20
},
{
"epoch": 4.0,
"eval_loss": 1.9770734310150146,
"eval_runtime": 46.4207,
"eval_samples_per_second": 21.241,
"eval_steps_per_second": 0.345,
"step": 24
},
{
"epoch": 4.17,
"grad_norm": 14.021051311636999,
"learning_rate": 1.6427876096865393e-06,
"loss": 2.6253,
"step": 25
},
{
"epoch": 5.0,
"grad_norm": 11.855730015804184,
"learning_rate": 1.5e-06,
"loss": 2.5265,
"step": 30
},
{
"epoch": 5.0,
"eval_loss": 1.9128954410552979,
"eval_runtime": 45.7673,
"eval_samples_per_second": 21.544,
"eval_steps_per_second": 0.35,
"step": 30
},
{
"epoch": 5.83,
"grad_norm": 10.919199689075139,
"learning_rate": 1.3420201433256689e-06,
"loss": 2.4727,
"step": 35
},
{
"epoch": 6.0,
"eval_loss": 1.8680870532989502,
"eval_runtime": 44.5512,
"eval_samples_per_second": 22.132,
"eval_steps_per_second": 0.359,
"step": 36
},
{
"epoch": 6.67,
"grad_norm": 11.284381796941899,
"learning_rate": 1.1736481776669305e-06,
"loss": 2.443,
"step": 40
},
{
"epoch": 7.0,
"eval_loss": 1.8343719244003296,
"eval_runtime": 46.2122,
"eval_samples_per_second": 21.336,
"eval_steps_per_second": 0.346,
"step": 42
},
{
"epoch": 7.5,
"grad_norm": 9.847559856792891,
"learning_rate": 1e-06,
"loss": 2.3432,
"step": 45
},
{
"epoch": 8.0,
"eval_loss": 1.8083339929580688,
"eval_runtime": 44.9515,
"eval_samples_per_second": 21.935,
"eval_steps_per_second": 0.356,
"step": 48
},
{
"epoch": 8.33,
"grad_norm": 9.870561751066663,
"learning_rate": 8.263518223330696e-07,
"loss": 2.3291,
"step": 50
},
{
"epoch": 9.0,
"eval_loss": 1.787840485572815,
"eval_runtime": 47.3715,
"eval_samples_per_second": 20.814,
"eval_steps_per_second": 0.338,
"step": 54
},
{
"epoch": 9.17,
"grad_norm": 10.44561543342826,
"learning_rate": 6.579798566743313e-07,
"loss": 2.3369,
"step": 55
},
{
"epoch": 10.0,
"grad_norm": 10.772104047660392,
"learning_rate": 5.000000000000002e-07,
"loss": 2.2843,
"step": 60
},
{
"epoch": 10.0,
"eval_loss": 1.7719180583953857,
"eval_runtime": 47.9838,
"eval_samples_per_second": 20.549,
"eval_steps_per_second": 0.333,
"step": 60
},
{
"epoch": 10.83,
"grad_norm": 10.611454385941395,
"learning_rate": 3.5721239031346063e-07,
"loss": 2.2529,
"step": 65
},
{
"epoch": 11.0,
"eval_loss": 1.7595422267913818,
"eval_runtime": 46.6624,
"eval_samples_per_second": 21.131,
"eval_steps_per_second": 0.343,
"step": 66
},
{
"epoch": 11.67,
"grad_norm": 10.96071622929993,
"learning_rate": 2.339555568810221e-07,
"loss": 2.2723,
"step": 70
},
{
"epoch": 12.0,
"eval_loss": 1.750933289527893,
"eval_runtime": 43.9252,
"eval_samples_per_second": 22.447,
"eval_steps_per_second": 0.364,
"step": 72
},
{
"epoch": 12.5,
"grad_norm": 11.705697271981819,
"learning_rate": 1.3397459621556128e-07,
"loss": 2.2302,
"step": 75
},
{
"epoch": 13.0,
"eval_loss": 1.746525526046753,
"eval_runtime": 45.2618,
"eval_samples_per_second": 21.784,
"eval_steps_per_second": 0.353,
"step": 78
},
{
"epoch": 13.33,
"grad_norm": 11.852420845520365,
"learning_rate": 6.030737921409168e-08,
"loss": 2.2224,
"step": 80
},
{
"epoch": 14.0,
"eval_loss": 1.744821548461914,
"eval_runtime": 43.5242,
"eval_samples_per_second": 22.654,
"eval_steps_per_second": 0.368,
"step": 84
},
{
"epoch": 14.17,
"grad_norm": 11.326567147878775,
"learning_rate": 1.519224698779198e-08,
"loss": 2.2113,
"step": 85
},
{
"epoch": 15.0,
"grad_norm": 11.882251961294145,
"learning_rate": 0.0,
"loss": 2.2309,
"step": 90
},
{
"epoch": 15.0,
"eval_loss": 1.7445234060287476,
"eval_runtime": 46.8833,
"eval_samples_per_second": 21.031,
"eval_steps_per_second": 0.341,
"step": 90
},
{
"epoch": 15.0,
"step": 90,
"total_flos": 24718476312576.0,
"train_loss": 2.68501771291097,
"train_runtime": 1215.2713,
"train_samples_per_second": 9.121,
"train_steps_per_second": 0.074
}
],
"logging_steps": 5,
"max_steps": 90,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 100,
"total_flos": 24718476312576.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
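
A minimal sketch (not part of the original artifact) of how one might inspect this log offline with the Python standard library, assuming the file above has been saved locally as trainer_state.json; the path and printed fields are illustrative, not prescribed by the repo:

import json

# Load the trainer state exported by the Hugging Face Trainer
# (the local path is an assumption; adjust to wherever the file was downloaded).
with open("trainer_state.json") as f:
    state = json.load(f)

# Evaluation records are the log_history entries that carry an "eval_loss" key;
# the remaining entries are per-logging-step training records.
eval_records = [r for r in state["log_history"] if "eval_loss" in r]

for r in eval_records:
    print(f"epoch {r['epoch']:>5}: eval_loss = {r['eval_loss']:.4f}")

# The final log_history entry summarises the run (total FLOPs, mean train loss, runtime).
summary = state["log_history"][-1]
print("final train_loss:", summary.get("train_loss"))
print("train_runtime (s):", summary.get("train_runtime"))

Run against this file, the loop would print the per-epoch eval_loss values recorded above (2.82 at epoch 1 down to about 1.74 at epoch 15), followed by the aggregate training loss and runtime from the closing summary entry.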