{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 825,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03636363636363636,
"grad_norm": 3.2293971017711174,
"learning_rate": 5e-06,
"loss": 1.0337,
"step": 10
},
{
"epoch": 0.07272727272727272,
"grad_norm": 1.2461654883314972,
"learning_rate": 5e-06,
"loss": 0.9092,
"step": 20
},
{
"epoch": 0.10909090909090909,
"grad_norm": 1.0937923635217501,
"learning_rate": 5e-06,
"loss": 0.8658,
"step": 30
},
{
"epoch": 0.14545454545454545,
"grad_norm": 1.3350225945199414,
"learning_rate": 5e-06,
"loss": 0.844,
"step": 40
},
{
"epoch": 0.18181818181818182,
"grad_norm": 1.029425810987488,
"learning_rate": 5e-06,
"loss": 0.8249,
"step": 50
},
{
"epoch": 0.21818181818181817,
"grad_norm": 1.5219290967515304,
"learning_rate": 5e-06,
"loss": 0.8068,
"step": 60
},
{
"epoch": 0.2545454545454545,
"grad_norm": 1.6740495880819521,
"learning_rate": 5e-06,
"loss": 0.7989,
"step": 70
},
{
"epoch": 0.2909090909090909,
"grad_norm": 1.2973735477904815,
"learning_rate": 5e-06,
"loss": 0.7921,
"step": 80
},
{
"epoch": 0.32727272727272727,
"grad_norm": 0.8566363002967183,
"learning_rate": 5e-06,
"loss": 0.781,
"step": 90
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.9961672641644985,
"learning_rate": 5e-06,
"loss": 0.7745,
"step": 100
},
{
"epoch": 0.4,
"grad_norm": 0.8927257959121373,
"learning_rate": 5e-06,
"loss": 0.7754,
"step": 110
},
{
"epoch": 0.43636363636363634,
"grad_norm": 0.6135178704985191,
"learning_rate": 5e-06,
"loss": 0.772,
"step": 120
},
{
"epoch": 0.4727272727272727,
"grad_norm": 0.7431505188106242,
"learning_rate": 5e-06,
"loss": 0.7686,
"step": 130
},
{
"epoch": 0.509090909090909,
"grad_norm": 0.7150787812569424,
"learning_rate": 5e-06,
"loss": 0.7618,
"step": 140
},
{
"epoch": 0.5454545454545454,
"grad_norm": 0.6352342662453642,
"learning_rate": 5e-06,
"loss": 0.7611,
"step": 150
},
{
"epoch": 0.5818181818181818,
"grad_norm": 0.6257901300873526,
"learning_rate": 5e-06,
"loss": 0.7569,
"step": 160
},
{
"epoch": 0.6181818181818182,
"grad_norm": 0.6387102446786417,
"learning_rate": 5e-06,
"loss": 0.7611,
"step": 170
},
{
"epoch": 0.6545454545454545,
"grad_norm": 0.5983754152683597,
"learning_rate": 5e-06,
"loss": 0.7546,
"step": 180
},
{
"epoch": 0.6909090909090909,
"grad_norm": 0.7480127979666656,
"learning_rate": 5e-06,
"loss": 0.7566,
"step": 190
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.5804396007389026,
"learning_rate": 5e-06,
"loss": 0.75,
"step": 200
},
{
"epoch": 0.7636363636363637,
"grad_norm": 0.682148918886327,
"learning_rate": 5e-06,
"loss": 0.7476,
"step": 210
},
{
"epoch": 0.8,
"grad_norm": 0.8039336411015884,
"learning_rate": 5e-06,
"loss": 0.7462,
"step": 220
},
{
"epoch": 0.8363636363636363,
"grad_norm": 0.6876607052536684,
"learning_rate": 5e-06,
"loss": 0.7411,
"step": 230
},
{
"epoch": 0.8727272727272727,
"grad_norm": 0.6588151842699974,
"learning_rate": 5e-06,
"loss": 0.7469,
"step": 240
},
{
"epoch": 0.9090909090909091,
"grad_norm": 0.6715213794720472,
"learning_rate": 5e-06,
"loss": 0.7378,
"step": 250
},
{
"epoch": 0.9454545454545454,
"grad_norm": 0.5870957383826958,
"learning_rate": 5e-06,
"loss": 0.7457,
"step": 260
},
{
"epoch": 0.9818181818181818,
"grad_norm": 0.6643986810785624,
"learning_rate": 5e-06,
"loss": 0.7466,
"step": 270
},
{
"epoch": 1.0,
"eval_loss": 0.7417545914649963,
"eval_runtime": 26.6218,
"eval_samples_per_second": 278.268,
"eval_steps_per_second": 1.089,
"step": 275
},
{
"epoch": 1.018181818181818,
"grad_norm": 0.9781239153342394,
"learning_rate": 5e-06,
"loss": 0.7136,
"step": 280
},
{
"epoch": 1.0545454545454545,
"grad_norm": 0.7152925984087143,
"learning_rate": 5e-06,
"loss": 0.6871,
"step": 290
},
{
"epoch": 1.0909090909090908,
"grad_norm": 0.6929492576277494,
"learning_rate": 5e-06,
"loss": 0.6894,
"step": 300
},
{
"epoch": 1.1272727272727272,
"grad_norm": 0.728764264622129,
"learning_rate": 5e-06,
"loss": 0.6935,
"step": 310
},
{
"epoch": 1.1636363636363636,
"grad_norm": 0.7252517543389313,
"learning_rate": 5e-06,
"loss": 0.6945,
"step": 320
},
{
"epoch": 1.2,
"grad_norm": 0.6665160391388197,
"learning_rate": 5e-06,
"loss": 0.6886,
"step": 330
},
{
"epoch": 1.2363636363636363,
"grad_norm": 0.7161659905517039,
"learning_rate": 5e-06,
"loss": 0.6898,
"step": 340
},
{
"epoch": 1.2727272727272727,
"grad_norm": 0.5719039452566653,
"learning_rate": 5e-06,
"loss": 0.6934,
"step": 350
},
{
"epoch": 1.309090909090909,
"grad_norm": 0.6060853746189843,
"learning_rate": 5e-06,
"loss": 0.6922,
"step": 360
},
{
"epoch": 1.3454545454545455,
"grad_norm": 0.6563719933283224,
"learning_rate": 5e-06,
"loss": 0.6912,
"step": 370
},
{
"epoch": 1.3818181818181818,
"grad_norm": 0.5958006047997326,
"learning_rate": 5e-06,
"loss": 0.6904,
"step": 380
},
{
"epoch": 1.4181818181818182,
"grad_norm": 0.7430218105320606,
"learning_rate": 5e-06,
"loss": 0.688,
"step": 390
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.6322073230662588,
"learning_rate": 5e-06,
"loss": 0.6883,
"step": 400
},
{
"epoch": 1.490909090909091,
"grad_norm": 0.7151221978666452,
"learning_rate": 5e-06,
"loss": 0.6934,
"step": 410
},
{
"epoch": 1.5272727272727273,
"grad_norm": 0.6184168187218901,
"learning_rate": 5e-06,
"loss": 0.6916,
"step": 420
},
{
"epoch": 1.5636363636363635,
"grad_norm": 0.6280848540221795,
"learning_rate": 5e-06,
"loss": 0.6916,
"step": 430
},
{
"epoch": 1.6,
"grad_norm": 0.6568705155050817,
"learning_rate": 5e-06,
"loss": 0.6856,
"step": 440
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.6359258851827682,
"learning_rate": 5e-06,
"loss": 0.6851,
"step": 450
},
{
"epoch": 1.6727272727272728,
"grad_norm": 0.710888538426671,
"learning_rate": 5e-06,
"loss": 0.6872,
"step": 460
},
{
"epoch": 1.709090909090909,
"grad_norm": 0.7584066029266229,
"learning_rate": 5e-06,
"loss": 0.6849,
"step": 470
},
{
"epoch": 1.7454545454545456,
"grad_norm": 0.5960492892442344,
"learning_rate": 5e-06,
"loss": 0.6891,
"step": 480
},
{
"epoch": 1.7818181818181817,
"grad_norm": 0.5629377755020811,
"learning_rate": 5e-06,
"loss": 0.6847,
"step": 490
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.589716689792314,
"learning_rate": 5e-06,
"loss": 0.6871,
"step": 500
},
{
"epoch": 1.8545454545454545,
"grad_norm": 0.5740509121739076,
"learning_rate": 5e-06,
"loss": 0.6888,
"step": 510
},
{
"epoch": 1.8909090909090909,
"grad_norm": 0.5891046247600111,
"learning_rate": 5e-06,
"loss": 0.6884,
"step": 520
},
{
"epoch": 1.9272727272727272,
"grad_norm": 0.6447276827053491,
"learning_rate": 5e-06,
"loss": 0.6893,
"step": 530
},
{
"epoch": 1.9636363636363636,
"grad_norm": 0.6935516132206995,
"learning_rate": 5e-06,
"loss": 0.6868,
"step": 540
},
{
"epoch": 2.0,
"grad_norm": 0.5781509823001448,
"learning_rate": 5e-06,
"loss": 0.6841,
"step": 550
},
{
"epoch": 2.0,
"eval_loss": 0.7281343340873718,
"eval_runtime": 26.4698,
"eval_samples_per_second": 279.867,
"eval_steps_per_second": 1.096,
"step": 550
},
{
"epoch": 2.036363636363636,
"grad_norm": 0.7551729949207574,
"learning_rate": 5e-06,
"loss": 0.6351,
"step": 560
},
{
"epoch": 2.0727272727272728,
"grad_norm": 0.6070448901420726,
"learning_rate": 5e-06,
"loss": 0.6307,
"step": 570
},
{
"epoch": 2.109090909090909,
"grad_norm": 0.7225948313371118,
"learning_rate": 5e-06,
"loss": 0.6357,
"step": 580
},
{
"epoch": 2.1454545454545455,
"grad_norm": 1.3944109200671733,
"learning_rate": 5e-06,
"loss": 0.6375,
"step": 590
},
{
"epoch": 2.1818181818181817,
"grad_norm": 1.1390572133302885,
"learning_rate": 5e-06,
"loss": 0.635,
"step": 600
},
{
"epoch": 2.2181818181818183,
"grad_norm": 0.7900509422330505,
"learning_rate": 5e-06,
"loss": 0.6383,
"step": 610
},
{
"epoch": 2.2545454545454544,
"grad_norm": 0.594871030626621,
"learning_rate": 5e-06,
"loss": 0.6321,
"step": 620
},
{
"epoch": 2.290909090909091,
"grad_norm": 0.665898906007086,
"learning_rate": 5e-06,
"loss": 0.6341,
"step": 630
},
{
"epoch": 2.327272727272727,
"grad_norm": 0.6509722897169726,
"learning_rate": 5e-06,
"loss": 0.6326,
"step": 640
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.6231670817005929,
"learning_rate": 5e-06,
"loss": 0.6385,
"step": 650
},
{
"epoch": 2.4,
"grad_norm": 0.6425410588561774,
"learning_rate": 5e-06,
"loss": 0.6373,
"step": 660
},
{
"epoch": 2.4363636363636365,
"grad_norm": 0.621241338432262,
"learning_rate": 5e-06,
"loss": 0.6399,
"step": 670
},
{
"epoch": 2.4727272727272727,
"grad_norm": 0.6924233110335524,
"learning_rate": 5e-06,
"loss": 0.6393,
"step": 680
},
{
"epoch": 2.509090909090909,
"grad_norm": 0.6419114963815122,
"learning_rate": 5e-06,
"loss": 0.6405,
"step": 690
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.7336852368102121,
"learning_rate": 5e-06,
"loss": 0.6385,
"step": 700
},
{
"epoch": 2.581818181818182,
"grad_norm": 0.7922288944252411,
"learning_rate": 5e-06,
"loss": 0.6377,
"step": 710
},
{
"epoch": 2.618181818181818,
"grad_norm": 0.6500377491351792,
"learning_rate": 5e-06,
"loss": 0.6427,
"step": 720
},
{
"epoch": 2.6545454545454543,
"grad_norm": 0.6853834065254241,
"learning_rate": 5e-06,
"loss": 0.6346,
"step": 730
},
{
"epoch": 2.690909090909091,
"grad_norm": 0.8156333668312422,
"learning_rate": 5e-06,
"loss": 0.6414,
"step": 740
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.6294215183471213,
"learning_rate": 5e-06,
"loss": 0.6363,
"step": 750
},
{
"epoch": 2.7636363636363637,
"grad_norm": 0.8237171162592375,
"learning_rate": 5e-06,
"loss": 0.6421,
"step": 760
},
{
"epoch": 2.8,
"grad_norm": 0.6772752476166749,
"learning_rate": 5e-06,
"loss": 0.6356,
"step": 770
},
{
"epoch": 2.8363636363636364,
"grad_norm": 0.7780500988065099,
"learning_rate": 5e-06,
"loss": 0.6425,
"step": 780
},
{
"epoch": 2.8727272727272726,
"grad_norm": 0.6862874007983163,
"learning_rate": 5e-06,
"loss": 0.6368,
"step": 790
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.5748210856771035,
"learning_rate": 5e-06,
"loss": 0.6405,
"step": 800
},
{
"epoch": 2.9454545454545453,
"grad_norm": 0.6351457621560951,
"learning_rate": 5e-06,
"loss": 0.6357,
"step": 810
},
{
"epoch": 2.981818181818182,
"grad_norm": 0.586627253325874,
"learning_rate": 5e-06,
"loss": 0.6412,
"step": 820
},
{
"epoch": 3.0,
"eval_loss": 0.7299705147743225,
"eval_runtime": 25.9228,
"eval_samples_per_second": 285.772,
"eval_steps_per_second": 1.119,
"step": 825
},
{
"epoch": 3.0,
"step": 825,
"total_flos": 1381905727488000.0,
"train_loss": 0.7044297796307188,
"train_runtime": 5353.5806,
"train_samples_per_second": 78.866,
"train_steps_per_second": 0.154
}
],
"logging_steps": 10,
"max_steps": 825,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1381905727488000.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}