roberta-tiny-8l-10M / trainer_state.json
g8a9's picture
End of training
194848b
{
"best_metric": 7.342555999755859,
"best_model_checkpoint": "/data1/attanasiog/babylm/roberta-tiny-8l-10M/checkpoint-700",
"epoch": 17.698779704560053,
"global_step": 850,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.21,
"learning_rate": 8e-05,
"loss": 10.2998,
"step": 10
},
{
"epoch": 0.41,
"learning_rate": 0.00016,
"loss": 8.9979,
"step": 20
},
{
"epoch": 0.62,
"learning_rate": 0.00024,
"loss": 7.8015,
"step": 30
},
{
"epoch": 0.82,
"learning_rate": 0.00032,
"loss": 7.3376,
"step": 40
},
{
"epoch": 1.04,
"learning_rate": 0.0004,
"loss": 7.8102,
"step": 50
},
{
"epoch": 1.04,
"eval_accuracy": 0.05136765891155645,
"eval_loss": 7.374657154083252,
"eval_runtime": 180.8913,
"eval_samples_per_second": 132.98,
"eval_steps_per_second": 4.157,
"step": 50
},
{
"epoch": 1.25,
"learning_rate": 0.000399995625676045,
"loss": 7.3419,
"step": 60
},
{
"epoch": 1.45,
"learning_rate": 0.0003999825028955268,
"loss": 8.1652,
"step": 70
},
{
"epoch": 1.66,
"learning_rate": 0.0003999606322324786,
"loss": 8.4182,
"step": 80
},
{
"epoch": 1.86,
"learning_rate": 0.0003999300146435939,
"loss": 7.3249,
"step": 90
},
{
"epoch": 2.08,
"learning_rate": 0.00039989065146818525,
"loss": 7.805,
"step": 100
},
{
"epoch": 2.08,
"eval_accuracy": 0.051684268514440884,
"eval_loss": 7.369903087615967,
"eval_runtime": 181.0798,
"eval_samples_per_second": 132.842,
"eval_steps_per_second": 4.153,
"step": 100
},
{
"epoch": 2.29,
"learning_rate": 0.0003998425444281255,
"loss": 7.3101,
"step": 110
},
{
"epoch": 2.49,
"learning_rate": 0.00039978569562777234,
"loss": 7.3232,
"step": 120
},
{
"epoch": 2.7,
"learning_rate": 0.0003997201075538765,
"loss": 7.3073,
"step": 130
},
{
"epoch": 2.9,
"learning_rate": 0.0003996457830754729,
"loss": 7.3236,
"step": 140
},
{
"epoch": 3.12,
"learning_rate": 0.00039956272544375493,
"loss": 7.7907,
"step": 150
},
{
"epoch": 3.12,
"eval_accuracy": 0.05174263561361906,
"eval_loss": 7.35952091217041,
"eval_runtime": 180.7769,
"eval_samples_per_second": 133.065,
"eval_steps_per_second": 4.16,
"step": 150
},
{
"epoch": 3.33,
"learning_rate": 0.00039947093829193245,
"loss": 7.2981,
"step": 160
},
{
"epoch": 3.53,
"learning_rate": 0.00039937042563507283,
"loss": 7.3259,
"step": 170
},
{
"epoch": 3.74,
"learning_rate": 0.00039926119186992537,
"loss": 7.3352,
"step": 180
},
{
"epoch": 3.95,
"learning_rate": 0.0003991432417747288,
"loss": 7.3069,
"step": 190
},
{
"epoch": 4.16,
"learning_rate": 0.0003990165805090023,
"loss": 7.7838,
"step": 200
},
{
"epoch": 4.16,
"eval_accuracy": 0.05138188801155976,
"eval_loss": 7.361721992492676,
"eval_runtime": 180.6907,
"eval_samples_per_second": 133.128,
"eval_steps_per_second": 4.162,
"step": 200
},
{
"epoch": 4.37,
"learning_rate": 0.00039888121361332003,
"loss": 7.3066,
"step": 210
},
{
"epoch": 4.58,
"learning_rate": 0.0003987371470090686,
"loss": 7.3237,
"step": 220
},
{
"epoch": 4.78,
"learning_rate": 0.00039858438699818784,
"loss": 7.3209,
"step": 230
},
{
"epoch": 4.99,
"learning_rate": 0.0003984229402628956,
"loss": 7.3024,
"step": 240
},
{
"epoch": 5.21,
"learning_rate": 0.00039825281386539503,
"loss": 7.7706,
"step": 250
},
{
"epoch": 5.21,
"eval_accuracy": 0.05140231728427503,
"eval_loss": 7.358623504638672,
"eval_runtime": 180.7786,
"eval_samples_per_second": 133.063,
"eval_steps_per_second": 4.16,
"step": 250
},
{
"epoch": 5.41,
"learning_rate": 0.000398074015247566,
"loss": 7.3135,
"step": 260
},
{
"epoch": 5.62,
"learning_rate": 0.0003978865522306392,
"loss": 7.3003,
"step": 270
},
{
"epoch": 5.82,
"learning_rate": 0.0003976904330148543,
"loss": 7.3159,
"step": 280
},
{
"epoch": 6.04,
"learning_rate": 0.00039748566617910113,
"loss": 7.7967,
"step": 290
},
{
"epoch": 6.25,
"learning_rate": 0.0003972722606805445,
"loss": 7.2933,
"step": 300
},
{
"epoch": 6.25,
"eval_accuracy": 0.05126180317018771,
"eval_loss": 7.356584548950195,
"eval_runtime": 180.7497,
"eval_samples_per_second": 133.085,
"eval_steps_per_second": 4.16,
"step": 300
},
{
"epoch": 6.45,
"learning_rate": 0.00039705022585423216,
"loss": 7.3163,
"step": 310
},
{
"epoch": 6.66,
"learning_rate": 0.0003968195714126868,
"loss": 7.2904,
"step": 320
},
{
"epoch": 6.86,
"learning_rate": 0.00039658030744548075,
"loss": 7.3045,
"step": 330
},
{
"epoch": 7.08,
"learning_rate": 0.0003963324444187952,
"loss": 7.7849,
"step": 340
},
{
"epoch": 7.29,
"learning_rate": 0.0003960759931749619,
"loss": 7.2932,
"step": 350
},
{
"epoch": 7.29,
"eval_accuracy": 0.05161072401384023,
"eval_loss": 7.3526611328125,
"eval_runtime": 180.6553,
"eval_samples_per_second": 133.154,
"eval_steps_per_second": 4.163,
"step": 350
},
{
"epoch": 7.49,
"learning_rate": 0.00039581096493198893,
"loss": 7.3057,
"step": 360
},
{
"epoch": 7.7,
"learning_rate": 0.0003955373712830703,
"loss": 7.3002,
"step": 370
},
{
"epoch": 7.9,
"learning_rate": 0.00039525522419607854,
"loss": 7.3029,
"step": 380
},
{
"epoch": 8.12,
"learning_rate": 0.0003949645360130412,
"loss": 7.7765,
"step": 390
},
{
"epoch": 8.33,
"learning_rate": 0.0003946653194496012,
"loss": 7.2986,
"step": 400
},
{
"epoch": 8.33,
"eval_accuracy": 0.051572554180051966,
"eval_loss": 7.356107234954834,
"eval_runtime": 180.5938,
"eval_samples_per_second": 133.199,
"eval_steps_per_second": 4.164,
"step": 400
},
{
"epoch": 8.53,
"learning_rate": 0.00039435758759446025,
"loss": 7.3093,
"step": 410
},
{
"epoch": 8.74,
"learning_rate": 0.00039404135390880664,
"loss": 7.294,
"step": 420
},
{
"epoch": 8.95,
"learning_rate": 0.0003937166322257262,
"loss": 7.3083,
"step": 430
},
{
"epoch": 9.16,
"learning_rate": 0.00039338343674959745,
"loss": 7.7912,
"step": 440
},
{
"epoch": 9.37,
"learning_rate": 0.00039304178205546976,
"loss": 7.289,
"step": 450
},
{
"epoch": 9.37,
"eval_accuracy": 0.05145224079666028,
"eval_loss": 7.34950590133667,
"eval_runtime": 180.7201,
"eval_samples_per_second": 133.106,
"eval_steps_per_second": 4.161,
"step": 450
},
{
"epoch": 9.58,
"learning_rate": 0.00039269168308842634,
"loss": 7.3004,
"step": 460
},
{
"epoch": 9.78,
"learning_rate": 0.00039233315516293006,
"loss": 7.2938,
"step": 470
},
{
"epoch": 9.99,
"learning_rate": 0.00039196621396215403,
"loss": 7.2897,
"step": 480
},
{
"epoch": 10.21,
"learning_rate": 0.000391590875537295,
"loss": 7.7652,
"step": 490
},
{
"epoch": 10.41,
"learning_rate": 0.00039120715630687155,
"loss": 7.2879,
"step": 500
},
{
"epoch": 10.41,
"eval_accuracy": 0.05138556381472711,
"eval_loss": 7.3455071449279785,
"eval_runtime": 180.6339,
"eval_samples_per_second": 133.17,
"eval_steps_per_second": 4.163,
"step": 500
},
{
"epoch": 10.62,
"learning_rate": 0.000390815073056006,
"loss": 7.2942,
"step": 510
},
{
"epoch": 10.82,
"learning_rate": 0.00039041464293568983,
"loss": 7.306,
"step": 520
},
{
"epoch": 11.04,
"learning_rate": 0.00039000588346203374,
"loss": 7.7754,
"step": 530
},
{
"epoch": 11.25,
"learning_rate": 0.0003895888125155014,
"loss": 7.2912,
"step": 540
},
{
"epoch": 11.45,
"learning_rate": 0.00038916344834012695,
"loss": 7.276,
"step": 550
},
{
"epoch": 11.45,
"eval_accuracy": 0.05130612004196204,
"eval_loss": 7.347738265991211,
"eval_runtime": 180.7636,
"eval_samples_per_second": 133.074,
"eval_steps_per_second": 4.16,
"step": 550
},
{
"epoch": 11.66,
"learning_rate": 0.00038872980954271757,
"loss": 7.3135,
"step": 560
},
{
"epoch": 11.86,
"learning_rate": 0.00038828791509203895,
"loss": 7.2859,
"step": 570
},
{
"epoch": 12.08,
"learning_rate": 0.00038783778431798597,
"loss": 7.7845,
"step": 580
},
{
"epoch": 12.29,
"learning_rate": 0.0003873794369107369,
"loss": 7.2966,
"step": 590
},
{
"epoch": 12.49,
"learning_rate": 0.0003869128929198922,
"loss": 7.3072,
"step": 600
},
{
"epoch": 12.49,
"eval_accuracy": 0.051627819878485845,
"eval_loss": 7.344621658325195,
"eval_runtime": 180.6519,
"eval_samples_per_second": 133.157,
"eval_steps_per_second": 4.163,
"step": 600
},
{
"epoch": 12.7,
"learning_rate": 0.0003864381727535973,
"loss": 7.3026,
"step": 610
},
{
"epoch": 12.9,
"learning_rate": 0.00038595529717765027,
"loss": 7.2966,
"step": 620
},
{
"epoch": 13.12,
"learning_rate": 0.0003854642873145931,
"loss": 7.7848,
"step": 630
},
{
"epoch": 13.33,
"learning_rate": 0.00038496516464278776,
"loss": 7.2964,
"step": 640
},
{
"epoch": 13.53,
"learning_rate": 0.00038445795099547697,
"loss": 7.2978,
"step": 650
},
{
"epoch": 13.53,
"eval_accuracy": 0.05143096217098587,
"eval_loss": 7.346319198608398,
"eval_runtime": 180.763,
"eval_samples_per_second": 133.075,
"eval_steps_per_second": 4.16,
"step": 650
},
{
"epoch": 13.74,
"learning_rate": 0.0003839426685598287,
"loss": 7.2919,
"step": 660
},
{
"epoch": 13.95,
"learning_rate": 0.000383419339875966,
"loss": 7.3006,
"step": 670
},
{
"epoch": 14.16,
"learning_rate": 0.00038288798783598087,
"loss": 7.7738,
"step": 680
},
{
"epoch": 14.37,
"learning_rate": 0.0003823486356829329,
"loss": 7.2839,
"step": 690
},
{
"epoch": 14.58,
"learning_rate": 0.0003818013070098325,
"loss": 7.2857,
"step": 700
},
{
"epoch": 14.58,
"eval_accuracy": 0.05146984844436126,
"eval_loss": 7.342555999755859,
"eval_runtime": 180.8063,
"eval_samples_per_second": 133.043,
"eval_steps_per_second": 4.159,
"step": 700
},
{
"epoch": 14.78,
"learning_rate": 0.0003812460257586089,
"loss": 7.2949,
"step": 710
},
{
"epoch": 14.99,
"learning_rate": 0.000380682816219063,
"loss": 7.3249,
"step": 720
},
{
"epoch": 15.21,
"learning_rate": 0.00038011170302780446,
"loss": 7.7486,
"step": 730
},
{
"epoch": 15.41,
"learning_rate": 0.00037953271116717444,
"loss": 7.2879,
"step": 740
},
{
"epoch": 15.62,
"learning_rate": 0.0003789458659641527,
"loss": 7.2868,
"step": 750
},
{
"epoch": 15.62,
"eval_accuracy": 0.05147383671825258,
"eval_loss": 7.343778610229492,
"eval_runtime": 180.8254,
"eval_samples_per_second": 133.029,
"eval_steps_per_second": 4.159,
"step": 750
},
{
"epoch": 15.82,
"learning_rate": 0.0003783511930892495,
"loss": 7.2986,
"step": 760
},
{
"epoch": 16.04,
"learning_rate": 0.00037774871855538275,
"loss": 7.7788,
"step": 770
},
{
"epoch": 16.25,
"learning_rate": 0.00037713846871674045,
"loss": 7.2858,
"step": 780
},
{
"epoch": 16.45,
"learning_rate": 0.0003765204702676274,
"loss": 7.2937,
"step": 790
},
{
"epoch": 16.66,
"learning_rate": 0.0003758947502412978,
"loss": 7.2973,
"step": 800
},
{
"epoch": 16.66,
"eval_accuracy": 0.051658592501666364,
"eval_loss": 7.344185829162598,
"eval_runtime": 180.7375,
"eval_samples_per_second": 133.094,
"eval_steps_per_second": 4.161,
"step": 800
},
{
"epoch": 16.86,
"learning_rate": 0.0003752613360087727,
"loss": 7.3043,
"step": 810
},
{
"epoch": 17.08,
"learning_rate": 0.00037462025527764265,
"loss": 7.7616,
"step": 820
},
{
"epoch": 17.29,
"learning_rate": 0.00037397153609085553,
"loss": 7.2869,
"step": 830
},
{
"epoch": 17.49,
"learning_rate": 0.0003733152068254901,
"loss": 7.2798,
"step": 840
},
{
"epoch": 17.7,
"learning_rate": 0.00037265129619151483,
"loss": 7.2988,
"step": 850
},
{
"epoch": 17.7,
"eval_accuracy": 0.051239018394020945,
"eval_loss": 7.343734264373779,
"eval_runtime": 180.5675,
"eval_samples_per_second": 133.219,
"eval_steps_per_second": 4.165,
"step": 850
},
{
"epoch": 17.7,
"step": 850,
"total_flos": 1.1524171581514752e+17,
"train_loss": 7.482659651812385,
"train_runtime": 11122.8848,
"train_samples_per_second": 223.953,
"train_steps_per_second": 0.432
}
],
"max_steps": 4800,
"num_train_epochs": 100,
"total_flos": 1.1524171581514752e+17,
"trial_name": null,
"trial_params": null
}