gemma-2b-sft-sass / trainer_state.json
Commit 9c5e45e ("commit files to HF hub") by Haining Wang
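The JSON below is the raw state written by the Hugging Face Trainer during SFT. As a minimal sketch of how one might read it (standard-library Python only; the local file path is an assumption, not part of this repository), the snippet pulls the train/eval loss curves and the best checkpoint recorded in the state:

import json

# Load the trainer state (path is illustrative; point it at a local copy of this file).
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history interleaves training records (keyed by "loss") and evaluation
# records (keyed by "eval_loss"); both carry the global "step".
train_curve = [(r["step"], r["loss"]) for r in state["log_history"] if "loss" in r]
eval_curve = [(r["step"], r["eval_loss"]) for r in state["log_history"] if "eval_loss" in r]

print(f'best eval loss {state["best_metric"]:.4f} at {state["best_model_checkpoint"]}')
for step, eval_loss in eval_curve:
    print(f"step {step:5d}  eval_loss {eval_loss:.4f}")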
{
"best_metric": 2.009188652038574,
"best_model_checkpoint": "ckpts/sft_gemma-2b/checkpoint-1680",
"epoch": 8.865435356200528,
"eval_steps": 20,
"global_step": 1680,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.10554089709762533,
"grad_norm": 6.84375,
"learning_rate": 4.000000000000001e-06,
"loss": 2.4965,
"step": 20
},
{
"epoch": 0.10554089709762533,
"eval_loss": 2.404269218444824,
"eval_runtime": 8.1633,
"eval_samples_per_second": 24.5,
"eval_steps_per_second": 6.125,
"step": 20
},
{
"epoch": 0.21108179419525067,
"grad_norm": 3.0,
"learning_rate": 8.000000000000001e-06,
"loss": 2.2807,
"step": 40
},
{
"epoch": 0.21108179419525067,
"eval_loss": 2.1983509063720703,
"eval_runtime": 7.9364,
"eval_samples_per_second": 25.2,
"eval_steps_per_second": 6.3,
"step": 40
},
{
"epoch": 0.316622691292876,
"grad_norm": 3.0,
"learning_rate": 1e-05,
"loss": 2.1723,
"step": 60
},
{
"epoch": 0.316622691292876,
"eval_loss": 2.157801628112793,
"eval_runtime": 8.1488,
"eval_samples_per_second": 24.543,
"eval_steps_per_second": 6.136,
"step": 60
},
{
"epoch": 0.42216358839050133,
"grad_norm": 2.84375,
"learning_rate": 1e-05,
"loss": 2.0888,
"step": 80
},
{
"epoch": 0.42216358839050133,
"eval_loss": 2.1461212635040283,
"eval_runtime": 8.0327,
"eval_samples_per_second": 24.898,
"eval_steps_per_second": 6.225,
"step": 80
},
{
"epoch": 0.5277044854881267,
"grad_norm": 2.890625,
"learning_rate": 1e-05,
"loss": 2.1187,
"step": 100
},
{
"epoch": 0.5277044854881267,
"eval_loss": 2.140232801437378,
"eval_runtime": 8.0366,
"eval_samples_per_second": 24.886,
"eval_steps_per_second": 6.222,
"step": 100
},
{
"epoch": 0.633245382585752,
"grad_norm": 2.9375,
"learning_rate": 1e-05,
"loss": 2.1293,
"step": 120
},
{
"epoch": 0.633245382585752,
"eval_loss": 2.135404109954834,
"eval_runtime": 8.1159,
"eval_samples_per_second": 24.643,
"eval_steps_per_second": 6.161,
"step": 120
},
{
"epoch": 0.7387862796833773,
"grad_norm": 2.640625,
"learning_rate": 1e-05,
"loss": 2.1351,
"step": 140
},
{
"epoch": 0.7387862796833773,
"eval_loss": 2.1313905715942383,
"eval_runtime": 7.9204,
"eval_samples_per_second": 25.251,
"eval_steps_per_second": 6.313,
"step": 140
},
{
"epoch": 0.8443271767810027,
"grad_norm": 2.8125,
"learning_rate": 1e-05,
"loss": 2.1204,
"step": 160
},
{
"epoch": 0.8443271767810027,
"eval_loss": 2.1264946460723877,
"eval_runtime": 8.1968,
"eval_samples_per_second": 24.4,
"eval_steps_per_second": 6.1,
"step": 160
},
{
"epoch": 0.9498680738786279,
"grad_norm": 2.953125,
"learning_rate": 1e-05,
"loss": 2.0984,
"step": 180
},
{
"epoch": 0.9498680738786279,
"eval_loss": 2.123286247253418,
"eval_runtime": 8.0328,
"eval_samples_per_second": 24.898,
"eval_steps_per_second": 6.225,
"step": 180
},
{
"epoch": 1.0554089709762533,
"grad_norm": 3.0,
"learning_rate": 1e-05,
"loss": 2.1008,
"step": 200
},
{
"epoch": 1.0554089709762533,
"eval_loss": 2.1210150718688965,
"eval_runtime": 7.8913,
"eval_samples_per_second": 25.344,
"eval_steps_per_second": 6.336,
"step": 200
},
{
"epoch": 1.1609498680738786,
"grad_norm": 2.9375,
"learning_rate": 1e-05,
"loss": 2.0771,
"step": 220
},
{
"epoch": 1.1609498680738786,
"eval_loss": 2.118210792541504,
"eval_runtime": 8.2706,
"eval_samples_per_second": 24.182,
"eval_steps_per_second": 6.045,
"step": 220
},
{
"epoch": 1.266490765171504,
"grad_norm": 2.71875,
"learning_rate": 1e-05,
"loss": 2.0659,
"step": 240
},
{
"epoch": 1.266490765171504,
"eval_loss": 2.1160335540771484,
"eval_runtime": 8.1186,
"eval_samples_per_second": 24.635,
"eval_steps_per_second": 6.159,
"step": 240
},
{
"epoch": 1.3720316622691293,
"grad_norm": 3.390625,
"learning_rate": 1e-05,
"loss": 2.0616,
"step": 260
},
{
"epoch": 1.3720316622691293,
"eval_loss": 2.113948106765747,
"eval_runtime": 8.3043,
"eval_samples_per_second": 24.084,
"eval_steps_per_second": 6.021,
"step": 260
},
{
"epoch": 1.4775725593667546,
"grad_norm": 2.875,
"learning_rate": 1e-05,
"loss": 2.1086,
"step": 280
},
{
"epoch": 1.4775725593667546,
"eval_loss": 2.1105477809906006,
"eval_runtime": 8.3509,
"eval_samples_per_second": 23.95,
"eval_steps_per_second": 5.987,
"step": 280
},
{
"epoch": 1.58311345646438,
"grad_norm": 2.625,
"learning_rate": 1e-05,
"loss": 2.0473,
"step": 300
},
{
"epoch": 1.58311345646438,
"eval_loss": 2.1075851917266846,
"eval_runtime": 8.3203,
"eval_samples_per_second": 24.037,
"eval_steps_per_second": 6.009,
"step": 300
},
{
"epoch": 1.6886543535620053,
"grad_norm": 2.71875,
"learning_rate": 1e-05,
"loss": 2.0455,
"step": 320
},
{
"epoch": 1.6886543535620053,
"eval_loss": 2.1052379608154297,
"eval_runtime": 8.4112,
"eval_samples_per_second": 23.778,
"eval_steps_per_second": 5.944,
"step": 320
},
{
"epoch": 1.7941952506596306,
"grad_norm": 3.125,
"learning_rate": 1e-05,
"loss": 2.0664,
"step": 340
},
{
"epoch": 1.7941952506596306,
"eval_loss": 2.102696180343628,
"eval_runtime": 8.1772,
"eval_samples_per_second": 24.458,
"eval_steps_per_second": 6.115,
"step": 340
},
{
"epoch": 1.899736147757256,
"grad_norm": 2.75,
"learning_rate": 1e-05,
"loss": 2.0559,
"step": 360
},
{
"epoch": 1.899736147757256,
"eval_loss": 2.100424289703369,
"eval_runtime": 8.2651,
"eval_samples_per_second": 24.198,
"eval_steps_per_second": 6.05,
"step": 360
},
{
"epoch": 2.005277044854881,
"grad_norm": 3.125,
"learning_rate": 1e-05,
"loss": 2.0638,
"step": 380
},
{
"epoch": 2.005277044854881,
"eval_loss": 2.0989837646484375,
"eval_runtime": 8.3491,
"eval_samples_per_second": 23.955,
"eval_steps_per_second": 5.989,
"step": 380
},
{
"epoch": 2.1108179419525066,
"grad_norm": 2.890625,
"learning_rate": 1e-05,
"loss": 2.0455,
"step": 400
},
{
"epoch": 2.1108179419525066,
"eval_loss": 2.097106456756592,
"eval_runtime": 8.261,
"eval_samples_per_second": 24.21,
"eval_steps_per_second": 6.053,
"step": 400
},
{
"epoch": 2.216358839050132,
"grad_norm": 2.8125,
"learning_rate": 1e-05,
"loss": 2.0114,
"step": 420
},
{
"epoch": 2.216358839050132,
"eval_loss": 2.095724582672119,
"eval_runtime": 8.1933,
"eval_samples_per_second": 24.41,
"eval_steps_per_second": 6.103,
"step": 420
},
{
"epoch": 2.321899736147757,
"grad_norm": 2.828125,
"learning_rate": 1e-05,
"loss": 2.0263,
"step": 440
},
{
"epoch": 2.321899736147757,
"eval_loss": 2.0944039821624756,
"eval_runtime": 8.3678,
"eval_samples_per_second": 23.901,
"eval_steps_per_second": 5.975,
"step": 440
},
{
"epoch": 2.4274406332453826,
"grad_norm": 2.9375,
"learning_rate": 1e-05,
"loss": 2.0127,
"step": 460
},
{
"epoch": 2.4274406332453826,
"eval_loss": 2.0919580459594727,
"eval_runtime": 8.3065,
"eval_samples_per_second": 24.078,
"eval_steps_per_second": 6.019,
"step": 460
},
{
"epoch": 2.532981530343008,
"grad_norm": 2.484375,
"learning_rate": 1e-05,
"loss": 1.9744,
"step": 480
},
{
"epoch": 2.532981530343008,
"eval_loss": 2.087993860244751,
"eval_runtime": 8.3136,
"eval_samples_per_second": 24.057,
"eval_steps_per_second": 6.014,
"step": 480
},
{
"epoch": 2.638522427440633,
"grad_norm": 2.984375,
"learning_rate": 1e-05,
"loss": 2.0236,
"step": 500
},
{
"epoch": 2.638522427440633,
"eval_loss": 2.086052656173706,
"eval_runtime": 8.3652,
"eval_samples_per_second": 23.908,
"eval_steps_per_second": 5.977,
"step": 500
},
{
"epoch": 2.7440633245382586,
"grad_norm": 2.671875,
"learning_rate": 1e-05,
"loss": 2.0146,
"step": 520
},
{
"epoch": 2.7440633245382586,
"eval_loss": 2.08500075340271,
"eval_runtime": 8.387,
"eval_samples_per_second": 23.846,
"eval_steps_per_second": 5.962,
"step": 520
},
{
"epoch": 2.849604221635884,
"grad_norm": 2.734375,
"learning_rate": 1e-05,
"loss": 2.0086,
"step": 540
},
{
"epoch": 2.849604221635884,
"eval_loss": 2.0832457542419434,
"eval_runtime": 8.2202,
"eval_samples_per_second": 24.33,
"eval_steps_per_second": 6.083,
"step": 540
},
{
"epoch": 2.955145118733509,
"grad_norm": 3.21875,
"learning_rate": 1e-05,
"loss": 2.0381,
"step": 560
},
{
"epoch": 2.955145118733509,
"eval_loss": 2.0816333293914795,
"eval_runtime": 8.1766,
"eval_samples_per_second": 24.46,
"eval_steps_per_second": 6.115,
"step": 560
},
{
"epoch": 3.0606860158311346,
"grad_norm": 2.8125,
"learning_rate": 1e-05,
"loss": 1.9999,
"step": 580
},
{
"epoch": 3.0606860158311346,
"eval_loss": 2.0814907550811768,
"eval_runtime": 8.1023,
"eval_samples_per_second": 24.684,
"eval_steps_per_second": 6.171,
"step": 580
},
{
"epoch": 3.16622691292876,
"grad_norm": 2.953125,
"learning_rate": 1e-05,
"loss": 1.9754,
"step": 600
},
{
"epoch": 3.16622691292876,
"eval_loss": 2.0809905529022217,
"eval_runtime": 8.2702,
"eval_samples_per_second": 24.183,
"eval_steps_per_second": 6.046,
"step": 600
},
{
"epoch": 3.271767810026385,
"grad_norm": 3.09375,
"learning_rate": 1e-05,
"loss": 1.9742,
"step": 620
},
{
"epoch": 3.271767810026385,
"eval_loss": 2.0800254344940186,
"eval_runtime": 8.3829,
"eval_samples_per_second": 23.858,
"eval_steps_per_second": 5.965,
"step": 620
},
{
"epoch": 3.3773087071240107,
"grad_norm": 3.375,
"learning_rate": 1e-05,
"loss": 1.9646,
"step": 640
},
{
"epoch": 3.3773087071240107,
"eval_loss": 2.078634738922119,
"eval_runtime": 8.1957,
"eval_samples_per_second": 24.403,
"eval_steps_per_second": 6.101,
"step": 640
},
{
"epoch": 3.4828496042216357,
"grad_norm": 2.765625,
"learning_rate": 1e-05,
"loss": 1.9785,
"step": 660
},
{
"epoch": 3.4828496042216357,
"eval_loss": 2.075782537460327,
"eval_runtime": 8.2605,
"eval_samples_per_second": 24.211,
"eval_steps_per_second": 6.053,
"step": 660
},
{
"epoch": 3.588390501319261,
"grad_norm": 4.21875,
"learning_rate": 1e-05,
"loss": 1.9755,
"step": 680
},
{
"epoch": 3.588390501319261,
"eval_loss": 2.0737786293029785,
"eval_runtime": 8.3746,
"eval_samples_per_second": 23.882,
"eval_steps_per_second": 5.97,
"step": 680
},
{
"epoch": 3.6939313984168867,
"grad_norm": 2.765625,
"learning_rate": 1e-05,
"loss": 1.9667,
"step": 700
},
{
"epoch": 3.6939313984168867,
"eval_loss": 2.0726418495178223,
"eval_runtime": 8.2601,
"eval_samples_per_second": 24.213,
"eval_steps_per_second": 6.053,
"step": 700
},
{
"epoch": 3.7994722955145117,
"grad_norm": 2.828125,
"learning_rate": 1e-05,
"loss": 1.9623,
"step": 720
},
{
"epoch": 3.7994722955145117,
"eval_loss": 2.070995330810547,
"eval_runtime": 8.3051,
"eval_samples_per_second": 24.081,
"eval_steps_per_second": 6.02,
"step": 720
},
{
"epoch": 3.905013192612137,
"grad_norm": 3.03125,
"learning_rate": 1e-05,
"loss": 1.9702,
"step": 740
},
{
"epoch": 3.905013192612137,
"eval_loss": 2.068690776824951,
"eval_runtime": 8.3707,
"eval_samples_per_second": 23.893,
"eval_steps_per_second": 5.973,
"step": 740
},
{
"epoch": 4.010554089709762,
"grad_norm": 2.96875,
"learning_rate": 1e-05,
"loss": 1.9795,
"step": 760
},
{
"epoch": 4.010554089709762,
"eval_loss": 2.0664331912994385,
"eval_runtime": 9.1952,
"eval_samples_per_second": 21.75,
"eval_steps_per_second": 5.438,
"step": 760
},
{
"epoch": 4.116094986807388,
"grad_norm": 3.0,
"learning_rate": 1e-05,
"loss": 1.9469,
"step": 780
},
{
"epoch": 4.116094986807388,
"eval_loss": 2.0662195682525635,
"eval_runtime": 8.2606,
"eval_samples_per_second": 24.211,
"eval_steps_per_second": 6.053,
"step": 780
},
{
"epoch": 4.221635883905013,
"grad_norm": 2.984375,
"learning_rate": 1e-05,
"loss": 1.9415,
"step": 800
},
{
"epoch": 4.221635883905013,
"eval_loss": 2.0639867782592773,
"eval_runtime": 8.227,
"eval_samples_per_second": 24.31,
"eval_steps_per_second": 6.078,
"step": 800
},
{
"epoch": 4.327176781002638,
"grad_norm": 2.78125,
"learning_rate": 1e-05,
"loss": 1.9574,
"step": 820
},
{
"epoch": 4.327176781002638,
"eval_loss": 2.062091588973999,
"eval_runtime": 8.2262,
"eval_samples_per_second": 24.313,
"eval_steps_per_second": 6.078,
"step": 820
},
{
"epoch": 4.432717678100264,
"grad_norm": 3.203125,
"learning_rate": 1e-05,
"loss": 1.9202,
"step": 840
},
{
"epoch": 4.432717678100264,
"eval_loss": 2.0608723163604736,
"eval_runtime": 8.2161,
"eval_samples_per_second": 24.343,
"eval_steps_per_second": 6.086,
"step": 840
},
{
"epoch": 4.538258575197889,
"grad_norm": 2.96875,
"learning_rate": 1e-05,
"loss": 1.9302,
"step": 860
},
{
"epoch": 4.538258575197889,
"eval_loss": 2.058367967605591,
"eval_runtime": 8.2827,
"eval_samples_per_second": 24.147,
"eval_steps_per_second": 6.037,
"step": 860
},
{
"epoch": 4.643799472295514,
"grad_norm": 3.109375,
"learning_rate": 1e-05,
"loss": 1.9112,
"step": 880
},
{
"epoch": 4.643799472295514,
"eval_loss": 2.058866500854492,
"eval_runtime": 8.2412,
"eval_samples_per_second": 24.268,
"eval_steps_per_second": 6.067,
"step": 880
},
{
"epoch": 4.74934036939314,
"grad_norm": 2.890625,
"learning_rate": 1e-05,
"loss": 1.9127,
"step": 900
},
{
"epoch": 4.74934036939314,
"eval_loss": 2.0560219287872314,
"eval_runtime": 8.2213,
"eval_samples_per_second": 24.327,
"eval_steps_per_second": 6.082,
"step": 900
},
{
"epoch": 4.854881266490765,
"grad_norm": 2.953125,
"learning_rate": 1e-05,
"loss": 1.899,
"step": 920
},
{
"epoch": 4.854881266490765,
"eval_loss": 2.054474353790283,
"eval_runtime": 8.0902,
"eval_samples_per_second": 24.721,
"eval_steps_per_second": 6.18,
"step": 920
},
{
"epoch": 4.96042216358839,
"grad_norm": 2.90625,
"learning_rate": 1e-05,
"loss": 1.9248,
"step": 940
},
{
"epoch": 4.96042216358839,
"eval_loss": 2.052360773086548,
"eval_runtime": 8.1899,
"eval_samples_per_second": 24.42,
"eval_steps_per_second": 6.105,
"step": 940
},
{
"epoch": 5.065963060686016,
"grad_norm": 2.90625,
"learning_rate": 1e-05,
"loss": 1.8878,
"step": 960
},
{
"epoch": 5.065963060686016,
"eval_loss": 2.0526058673858643,
"eval_runtime": 8.0873,
"eval_samples_per_second": 24.73,
"eval_steps_per_second": 6.183,
"step": 960
},
{
"epoch": 5.171503957783641,
"grad_norm": 3.234375,
"learning_rate": 1e-05,
"loss": 1.8789,
"step": 980
},
{
"epoch": 5.171503957783641,
"eval_loss": 2.0522069931030273,
"eval_runtime": 8.0535,
"eval_samples_per_second": 24.834,
"eval_steps_per_second": 6.208,
"step": 980
},
{
"epoch": 5.277044854881266,
"grad_norm": 3.015625,
"learning_rate": 1e-05,
"loss": 1.8908,
"step": 1000
},
{
"epoch": 5.277044854881266,
"eval_loss": 2.051866054534912,
"eval_runtime": 7.9907,
"eval_samples_per_second": 25.029,
"eval_steps_per_second": 6.257,
"step": 1000
},
{
"epoch": 5.382585751978892,
"grad_norm": 3.046875,
"learning_rate": 1e-05,
"loss": 1.8944,
"step": 1020
},
{
"epoch": 5.382585751978892,
"eval_loss": 2.0503861904144287,
"eval_runtime": 7.927,
"eval_samples_per_second": 25.23,
"eval_steps_per_second": 6.308,
"step": 1020
},
{
"epoch": 5.488126649076517,
"grad_norm": 4.15625,
"learning_rate": 1e-05,
"loss": 1.8867,
"step": 1040
},
{
"epoch": 5.488126649076517,
"eval_loss": 2.0466654300689697,
"eval_runtime": 8.3776,
"eval_samples_per_second": 23.873,
"eval_steps_per_second": 5.968,
"step": 1040
},
{
"epoch": 5.593667546174142,
"grad_norm": 3.0,
"learning_rate": 1e-05,
"loss": 1.8764,
"step": 1060
},
{
"epoch": 5.593667546174142,
"eval_loss": 2.0448718070983887,
"eval_runtime": 8.2526,
"eval_samples_per_second": 24.235,
"eval_steps_per_second": 6.059,
"step": 1060
},
{
"epoch": 5.699208443271768,
"grad_norm": 3.140625,
"learning_rate": 1e-05,
"loss": 1.9082,
"step": 1080
},
{
"epoch": 5.699208443271768,
"eval_loss": 2.0424439907073975,
"eval_runtime": 8.0973,
"eval_samples_per_second": 24.7,
"eval_steps_per_second": 6.175,
"step": 1080
},
{
"epoch": 5.804749340369393,
"grad_norm": 3.296875,
"learning_rate": 1e-05,
"loss": 1.8782,
"step": 1100
},
{
"epoch": 5.804749340369393,
"eval_loss": 2.0422487258911133,
"eval_runtime": 8.3197,
"eval_samples_per_second": 24.039,
"eval_steps_per_second": 6.01,
"step": 1100
},
{
"epoch": 5.910290237467018,
"grad_norm": 2.984375,
"learning_rate": 1e-05,
"loss": 1.8394,
"step": 1120
},
{
"epoch": 5.910290237467018,
"eval_loss": 2.0410642623901367,
"eval_runtime": 8.2994,
"eval_samples_per_second": 24.098,
"eval_steps_per_second": 6.025,
"step": 1120
},
{
"epoch": 6.015831134564644,
"grad_norm": 4.1875,
"learning_rate": 1e-05,
"loss": 1.864,
"step": 1140
},
{
"epoch": 6.015831134564644,
"eval_loss": 2.039353370666504,
"eval_runtime": 8.2994,
"eval_samples_per_second": 24.098,
"eval_steps_per_second": 6.025,
"step": 1140
},
{
"epoch": 6.121372031662269,
"grad_norm": 3.265625,
"learning_rate": 1e-05,
"loss": 1.8246,
"step": 1160
},
{
"epoch": 6.121372031662269,
"eval_loss": 2.042710304260254,
"eval_runtime": 8.5324,
"eval_samples_per_second": 23.44,
"eval_steps_per_second": 5.86,
"step": 1160
},
{
"epoch": 6.226912928759894,
"grad_norm": 3.3125,
"learning_rate": 1e-05,
"loss": 1.8343,
"step": 1180
},
{
"epoch": 6.226912928759894,
"eval_loss": 2.0403542518615723,
"eval_runtime": 8.2652,
"eval_samples_per_second": 24.198,
"eval_steps_per_second": 6.049,
"step": 1180
},
{
"epoch": 6.33245382585752,
"grad_norm": 3.984375,
"learning_rate": 1e-05,
"loss": 1.8541,
"step": 1200
},
{
"epoch": 6.33245382585752,
"eval_loss": 2.03813099861145,
"eval_runtime": 8.4385,
"eval_samples_per_second": 23.701,
"eval_steps_per_second": 5.925,
"step": 1200
},
{
"epoch": 6.437994722955145,
"grad_norm": 3.203125,
"learning_rate": 1e-05,
"loss": 1.8182,
"step": 1220
},
{
"epoch": 6.437994722955145,
"eval_loss": 2.038771629333496,
"eval_runtime": 8.3561,
"eval_samples_per_second": 23.934,
"eval_steps_per_second": 5.984,
"step": 1220
},
{
"epoch": 6.54353562005277,
"grad_norm": 3.125,
"learning_rate": 1e-05,
"loss": 1.8427,
"step": 1240
},
{
"epoch": 6.54353562005277,
"eval_loss": 2.0339856147766113,
"eval_runtime": 8.3288,
"eval_samples_per_second": 24.013,
"eval_steps_per_second": 6.003,
"step": 1240
},
{
"epoch": 6.649076517150396,
"grad_norm": 3.15625,
"learning_rate": 1e-05,
"loss": 1.8289,
"step": 1260
},
{
"epoch": 6.649076517150396,
"eval_loss": 2.035248041152954,
"eval_runtime": 8.281,
"eval_samples_per_second": 24.152,
"eval_steps_per_second": 6.038,
"step": 1260
},
{
"epoch": 6.754617414248021,
"grad_norm": 3.4375,
"learning_rate": 1e-05,
"loss": 1.8415,
"step": 1280
},
{
"epoch": 6.754617414248021,
"eval_loss": 2.031825304031372,
"eval_runtime": 8.3052,
"eval_samples_per_second": 24.081,
"eval_steps_per_second": 6.02,
"step": 1280
},
{
"epoch": 6.860158311345646,
"grad_norm": 3.453125,
"learning_rate": 1e-05,
"loss": 1.8357,
"step": 1300
},
{
"epoch": 6.860158311345646,
"eval_loss": 2.028428316116333,
"eval_runtime": 8.3001,
"eval_samples_per_second": 24.096,
"eval_steps_per_second": 6.024,
"step": 1300
},
{
"epoch": 6.965699208443271,
"grad_norm": 3.140625,
"learning_rate": 1e-05,
"loss": 1.8324,
"step": 1320
},
{
"epoch": 6.965699208443271,
"eval_loss": 2.0289885997772217,
"eval_runtime": 8.1618,
"eval_samples_per_second": 24.504,
"eval_steps_per_second": 6.126,
"step": 1320
},
{
"epoch": 7.071240105540897,
"grad_norm": 3.5625,
"learning_rate": 1e-05,
"loss": 1.8069,
"step": 1340
},
{
"epoch": 7.071240105540897,
"eval_loss": 2.0348060131073,
"eval_runtime": 8.1951,
"eval_samples_per_second": 24.405,
"eval_steps_per_second": 6.101,
"step": 1340
},
{
"epoch": 7.176781002638522,
"grad_norm": 3.375,
"learning_rate": 1e-05,
"loss": 1.8152,
"step": 1360
},
{
"epoch": 7.176781002638522,
"eval_loss": 2.0321884155273438,
"eval_runtime": 8.2785,
"eval_samples_per_second": 24.159,
"eval_steps_per_second": 6.04,
"step": 1360
},
{
"epoch": 7.282321899736147,
"grad_norm": 3.234375,
"learning_rate": 1e-05,
"loss": 1.7871,
"step": 1380
},
{
"epoch": 7.282321899736147,
"eval_loss": 2.0307512283325195,
"eval_runtime": 8.0505,
"eval_samples_per_second": 24.843,
"eval_steps_per_second": 6.211,
"step": 1380
},
{
"epoch": 7.387862796833773,
"grad_norm": 3.59375,
"learning_rate": 1e-05,
"loss": 1.7871,
"step": 1400
},
{
"epoch": 7.387862796833773,
"eval_loss": 2.0273208618164062,
"eval_runtime": 8.1896,
"eval_samples_per_second": 24.421,
"eval_steps_per_second": 6.105,
"step": 1400
},
{
"epoch": 7.493403693931398,
"grad_norm": 3.328125,
"learning_rate": 1e-05,
"loss": 1.8076,
"step": 1420
},
{
"epoch": 7.493403693931398,
"eval_loss": 2.0257158279418945,
"eval_runtime": 7.9266,
"eval_samples_per_second": 25.232,
"eval_steps_per_second": 6.308,
"step": 1420
},
{
"epoch": 7.598944591029023,
"grad_norm": 3.28125,
"learning_rate": 1e-05,
"loss": 1.7753,
"step": 1440
},
{
"epoch": 7.598944591029023,
"eval_loss": 2.026719570159912,
"eval_runtime": 7.8566,
"eval_samples_per_second": 25.456,
"eval_steps_per_second": 6.364,
"step": 1440
},
{
"epoch": 7.704485488126649,
"grad_norm": 3.453125,
"learning_rate": 1e-05,
"loss": 1.761,
"step": 1460
},
{
"epoch": 7.704485488126649,
"eval_loss": 2.022343397140503,
"eval_runtime": 8.1505,
"eval_samples_per_second": 24.538,
"eval_steps_per_second": 6.135,
"step": 1460
},
{
"epoch": 7.810026385224274,
"grad_norm": 3.234375,
"learning_rate": 1e-05,
"loss": 1.7837,
"step": 1480
},
{
"epoch": 7.810026385224274,
"eval_loss": 2.0227696895599365,
"eval_runtime": 7.9021,
"eval_samples_per_second": 25.31,
"eval_steps_per_second": 6.327,
"step": 1480
},
{
"epoch": 7.915567282321899,
"grad_norm": 3.5625,
"learning_rate": 1e-05,
"loss": 1.7809,
"step": 1500
},
{
"epoch": 7.915567282321899,
"eval_loss": 2.0224175453186035,
"eval_runtime": 8.146,
"eval_samples_per_second": 24.552,
"eval_steps_per_second": 6.138,
"step": 1500
},
{
"epoch": 8.021108179419524,
"grad_norm": 3.59375,
"learning_rate": 1e-05,
"loss": 1.779,
"step": 1520
},
{
"epoch": 8.021108179419524,
"eval_loss": 2.0209100246429443,
"eval_runtime": 8.392,
"eval_samples_per_second": 23.832,
"eval_steps_per_second": 5.958,
"step": 1520
},
{
"epoch": 8.12664907651715,
"grad_norm": 3.109375,
"learning_rate": 1e-05,
"loss": 1.7353,
"step": 1540
},
{
"epoch": 8.12664907651715,
"eval_loss": 2.0220282077789307,
"eval_runtime": 8.6161,
"eval_samples_per_second": 23.212,
"eval_steps_per_second": 5.803,
"step": 1540
},
{
"epoch": 8.232189973614776,
"grad_norm": 3.5625,
"learning_rate": 1e-05,
"loss": 1.7363,
"step": 1560
},
{
"epoch": 8.232189973614776,
"eval_loss": 2.0166220664978027,
"eval_runtime": 8.2719,
"eval_samples_per_second": 24.178,
"eval_steps_per_second": 6.045,
"step": 1560
},
{
"epoch": 8.3377308707124,
"grad_norm": 3.71875,
"learning_rate": 1e-05,
"loss": 1.7511,
"step": 1580
},
{
"epoch": 8.3377308707124,
"eval_loss": 2.01631236076355,
"eval_runtime": 8.2537,
"eval_samples_per_second": 24.232,
"eval_steps_per_second": 6.058,
"step": 1580
},
{
"epoch": 8.443271767810026,
"grad_norm": 3.828125,
"learning_rate": 1e-05,
"loss": 1.767,
"step": 1600
},
{
"epoch": 8.443271767810026,
"eval_loss": 2.016242265701294,
"eval_runtime": 8.1762,
"eval_samples_per_second": 24.461,
"eval_steps_per_second": 6.115,
"step": 1600
},
{
"epoch": 8.548812664907652,
"grad_norm": 4.4375,
"learning_rate": 1e-05,
"loss": 1.6945,
"step": 1620
},
{
"epoch": 8.548812664907652,
"eval_loss": 2.019789218902588,
"eval_runtime": 8.4823,
"eval_samples_per_second": 23.579,
"eval_steps_per_second": 5.895,
"step": 1620
},
{
"epoch": 8.654353562005277,
"grad_norm": 4.5625,
"learning_rate": 1e-05,
"loss": 1.7087,
"step": 1640
},
{
"epoch": 8.654353562005277,
"eval_loss": 2.0212345123291016,
"eval_runtime": 8.1335,
"eval_samples_per_second": 24.59,
"eval_steps_per_second": 6.147,
"step": 1640
},
{
"epoch": 8.759894459102902,
"grad_norm": 3.921875,
"learning_rate": 1e-05,
"loss": 1.7702,
"step": 1660
},
{
"epoch": 8.759894459102902,
"eval_loss": 2.0104410648345947,
"eval_runtime": 8.0047,
"eval_samples_per_second": 24.985,
"eval_steps_per_second": 6.246,
"step": 1660
},
{
"epoch": 8.865435356200528,
"grad_norm": 4.0625,
"learning_rate": 1e-05,
"loss": 1.7563,
"step": 1680
},
{
"epoch": 8.865435356200528,
"eval_loss": 2.009188652038574,
"eval_runtime": 8.1504,
"eval_samples_per_second": 24.539,
"eval_steps_per_second": 6.135,
"step": 1680
}
],
"logging_steps": 20,
"max_steps": 9450,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 20,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.6272812961435648e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}