oh-dcft-v1.1-no-curation / trainer_state.json
sedrickkeh's picture
End of training
6d31776 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9980976537729864,
"eval_steps": 500,
"global_step": 1182,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025364616360177554,
"grad_norm": 2.4958535272810582,
"learning_rate": 5e-06,
"loss": 0.7847,
"step": 10
},
{
"epoch": 0.05072923272035511,
"grad_norm": 1.332301089693953,
"learning_rate": 5e-06,
"loss": 0.6565,
"step": 20
},
{
"epoch": 0.07609384908053266,
"grad_norm": 0.8570758334622178,
"learning_rate": 5e-06,
"loss": 0.6087,
"step": 30
},
{
"epoch": 0.10145846544071022,
"grad_norm": 0.7961220667217911,
"learning_rate": 5e-06,
"loss": 0.5923,
"step": 40
},
{
"epoch": 0.12682308180088775,
"grad_norm": 1.0397565268140934,
"learning_rate": 5e-06,
"loss": 0.5693,
"step": 50
},
{
"epoch": 0.1521876981610653,
"grad_norm": 0.8562621904197287,
"learning_rate": 5e-06,
"loss": 0.5503,
"step": 60
},
{
"epoch": 0.17755231452124287,
"grad_norm": 0.8886037080258321,
"learning_rate": 5e-06,
"loss": 0.5498,
"step": 70
},
{
"epoch": 0.20291693088142043,
"grad_norm": 0.6838633568199429,
"learning_rate": 5e-06,
"loss": 0.534,
"step": 80
},
{
"epoch": 0.22828154724159797,
"grad_norm": 0.6556349806544307,
"learning_rate": 5e-06,
"loss": 0.535,
"step": 90
},
{
"epoch": 0.2536461636017755,
"grad_norm": 0.7469146203238889,
"learning_rate": 5e-06,
"loss": 0.5333,
"step": 100
},
{
"epoch": 0.27901077996195306,
"grad_norm": 0.5122739271343442,
"learning_rate": 5e-06,
"loss": 0.5172,
"step": 110
},
{
"epoch": 0.3043753963221306,
"grad_norm": 0.9622908612260581,
"learning_rate": 5e-06,
"loss": 0.5257,
"step": 120
},
{
"epoch": 0.3297400126823082,
"grad_norm": 0.6046903397133303,
"learning_rate": 5e-06,
"loss": 0.5151,
"step": 130
},
{
"epoch": 0.35510462904248574,
"grad_norm": 0.47575500456135494,
"learning_rate": 5e-06,
"loss": 0.5143,
"step": 140
},
{
"epoch": 0.3804692454026633,
"grad_norm": 0.7013205926571314,
"learning_rate": 5e-06,
"loss": 0.5144,
"step": 150
},
{
"epoch": 0.40583386176284086,
"grad_norm": 0.6351422540630048,
"learning_rate": 5e-06,
"loss": 0.5058,
"step": 160
},
{
"epoch": 0.43119847812301837,
"grad_norm": 0.6261693017885483,
"learning_rate": 5e-06,
"loss": 0.5102,
"step": 170
},
{
"epoch": 0.45656309448319593,
"grad_norm": 0.7605740230985341,
"learning_rate": 5e-06,
"loss": 0.5078,
"step": 180
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.5845861272533613,
"learning_rate": 5e-06,
"loss": 0.5043,
"step": 190
},
{
"epoch": 0.507292327203551,
"grad_norm": 0.7171099558889358,
"learning_rate": 5e-06,
"loss": 0.5053,
"step": 200
},
{
"epoch": 0.5326569435637286,
"grad_norm": 0.5158539718235372,
"learning_rate": 5e-06,
"loss": 0.5066,
"step": 210
},
{
"epoch": 0.5580215599239061,
"grad_norm": 0.7716179700630799,
"learning_rate": 5e-06,
"loss": 0.5037,
"step": 220
},
{
"epoch": 0.5833861762840837,
"grad_norm": 0.5356639716385265,
"learning_rate": 5e-06,
"loss": 0.5077,
"step": 230
},
{
"epoch": 0.6087507926442612,
"grad_norm": 0.5909560901543055,
"learning_rate": 5e-06,
"loss": 0.4978,
"step": 240
},
{
"epoch": 0.6341154090044389,
"grad_norm": 0.5612449176342577,
"learning_rate": 5e-06,
"loss": 0.4955,
"step": 250
},
{
"epoch": 0.6594800253646164,
"grad_norm": 0.8220158158926282,
"learning_rate": 5e-06,
"loss": 0.4932,
"step": 260
},
{
"epoch": 0.6848446417247939,
"grad_norm": 0.9803427711154427,
"learning_rate": 5e-06,
"loss": 0.4935,
"step": 270
},
{
"epoch": 0.7102092580849715,
"grad_norm": 0.7003489682973207,
"learning_rate": 5e-06,
"loss": 0.4921,
"step": 280
},
{
"epoch": 0.735573874445149,
"grad_norm": 0.7155818668831541,
"learning_rate": 5e-06,
"loss": 0.4941,
"step": 290
},
{
"epoch": 0.7609384908053266,
"grad_norm": 0.5641884255018443,
"learning_rate": 5e-06,
"loss": 0.4905,
"step": 300
},
{
"epoch": 0.7863031071655041,
"grad_norm": 0.5667685684791592,
"learning_rate": 5e-06,
"loss": 0.4972,
"step": 310
},
{
"epoch": 0.8116677235256817,
"grad_norm": 0.5424782856163526,
"learning_rate": 5e-06,
"loss": 0.4908,
"step": 320
},
{
"epoch": 0.8370323398858592,
"grad_norm": 0.555119069867457,
"learning_rate": 5e-06,
"loss": 0.49,
"step": 330
},
{
"epoch": 0.8623969562460367,
"grad_norm": 0.5540403091132209,
"learning_rate": 5e-06,
"loss": 0.4892,
"step": 340
},
{
"epoch": 0.8877615726062144,
"grad_norm": 0.6718528259146384,
"learning_rate": 5e-06,
"loss": 0.4879,
"step": 350
},
{
"epoch": 0.9131261889663919,
"grad_norm": 0.48504592421103015,
"learning_rate": 5e-06,
"loss": 0.4866,
"step": 360
},
{
"epoch": 0.9384908053265695,
"grad_norm": 0.5794400662308987,
"learning_rate": 5e-06,
"loss": 0.489,
"step": 370
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.49175786205010735,
"learning_rate": 5e-06,
"loss": 0.4792,
"step": 380
},
{
"epoch": 0.9892200380469245,
"grad_norm": 0.48088824717550854,
"learning_rate": 5e-06,
"loss": 0.4793,
"step": 390
},
{
"epoch": 0.9993658845909955,
"eval_loss": 0.48458319902420044,
"eval_runtime": 140.5522,
"eval_samples_per_second": 75.552,
"eval_steps_per_second": 0.591,
"step": 394
},
{
"epoch": 1.014584654407102,
"grad_norm": 0.6241434976553506,
"learning_rate": 5e-06,
"loss": 0.4668,
"step": 400
},
{
"epoch": 1.0399492707672797,
"grad_norm": 0.5387091155966651,
"learning_rate": 5e-06,
"loss": 0.4467,
"step": 410
},
{
"epoch": 1.0653138871274572,
"grad_norm": 0.6088667420403366,
"learning_rate": 5e-06,
"loss": 0.4552,
"step": 420
},
{
"epoch": 1.0906785034876347,
"grad_norm": 0.7635188991702534,
"learning_rate": 5e-06,
"loss": 0.4569,
"step": 430
},
{
"epoch": 1.1160431198478122,
"grad_norm": 0.5202613636726365,
"learning_rate": 5e-06,
"loss": 0.4532,
"step": 440
},
{
"epoch": 1.1414077362079897,
"grad_norm": 0.5431289298627378,
"learning_rate": 5e-06,
"loss": 0.4552,
"step": 450
},
{
"epoch": 1.1667723525681675,
"grad_norm": 0.5447516747773636,
"learning_rate": 5e-06,
"loss": 0.4517,
"step": 460
},
{
"epoch": 1.192136968928345,
"grad_norm": 0.5811733767557097,
"learning_rate": 5e-06,
"loss": 0.4596,
"step": 470
},
{
"epoch": 1.2175015852885225,
"grad_norm": 0.5291374404256166,
"learning_rate": 5e-06,
"loss": 0.4523,
"step": 480
},
{
"epoch": 1.2428662016487,
"grad_norm": 0.920406850160634,
"learning_rate": 5e-06,
"loss": 0.4512,
"step": 490
},
{
"epoch": 1.2682308180088775,
"grad_norm": 0.5379277068224477,
"learning_rate": 5e-06,
"loss": 0.4589,
"step": 500
},
{
"epoch": 1.2935954343690552,
"grad_norm": 0.6084288782824112,
"learning_rate": 5e-06,
"loss": 0.4476,
"step": 510
},
{
"epoch": 1.3189600507292327,
"grad_norm": 0.6373203390142074,
"learning_rate": 5e-06,
"loss": 0.4508,
"step": 520
},
{
"epoch": 1.3443246670894102,
"grad_norm": 0.5297816500484004,
"learning_rate": 5e-06,
"loss": 0.4519,
"step": 530
},
{
"epoch": 1.369689283449588,
"grad_norm": 0.5214550304276996,
"learning_rate": 5e-06,
"loss": 0.4507,
"step": 540
},
{
"epoch": 1.3950538998097652,
"grad_norm": 0.5932937282969459,
"learning_rate": 5e-06,
"loss": 0.4508,
"step": 550
},
{
"epoch": 1.420418516169943,
"grad_norm": 0.5015573262400715,
"learning_rate": 5e-06,
"loss": 0.4529,
"step": 560
},
{
"epoch": 1.4457831325301205,
"grad_norm": 0.6541003393290922,
"learning_rate": 5e-06,
"loss": 0.4487,
"step": 570
},
{
"epoch": 1.471147748890298,
"grad_norm": 0.4738510019221813,
"learning_rate": 5e-06,
"loss": 0.4437,
"step": 580
},
{
"epoch": 1.4965123652504757,
"grad_norm": 0.5284328908203406,
"learning_rate": 5e-06,
"loss": 0.449,
"step": 590
},
{
"epoch": 1.521876981610653,
"grad_norm": 0.5814801147707117,
"learning_rate": 5e-06,
"loss": 0.4498,
"step": 600
},
{
"epoch": 1.5472415979708307,
"grad_norm": 0.7380939259733779,
"learning_rate": 5e-06,
"loss": 0.4574,
"step": 610
},
{
"epoch": 1.5726062143310082,
"grad_norm": 0.5158189079851289,
"learning_rate": 5e-06,
"loss": 0.4553,
"step": 620
},
{
"epoch": 1.5979708306911857,
"grad_norm": 0.7517859976181999,
"learning_rate": 5e-06,
"loss": 0.4479,
"step": 630
},
{
"epoch": 1.6233354470513635,
"grad_norm": 0.4624484508717309,
"learning_rate": 5e-06,
"loss": 0.4484,
"step": 640
},
{
"epoch": 1.6487000634115407,
"grad_norm": 0.6517886187802472,
"learning_rate": 5e-06,
"loss": 0.4479,
"step": 650
},
{
"epoch": 1.6740646797717185,
"grad_norm": 0.5168694302612785,
"learning_rate": 5e-06,
"loss": 0.4498,
"step": 660
},
{
"epoch": 1.699429296131896,
"grad_norm": 0.5442235822761647,
"learning_rate": 5e-06,
"loss": 0.4546,
"step": 670
},
{
"epoch": 1.7247939124920735,
"grad_norm": 0.5866332538354704,
"learning_rate": 5e-06,
"loss": 0.4502,
"step": 680
},
{
"epoch": 1.7501585288522512,
"grad_norm": 0.5771993285709256,
"learning_rate": 5e-06,
"loss": 0.4489,
"step": 690
},
{
"epoch": 1.7755231452124287,
"grad_norm": 0.5856601574541924,
"learning_rate": 5e-06,
"loss": 0.4516,
"step": 700
},
{
"epoch": 1.8008877615726062,
"grad_norm": 0.5219735572020098,
"learning_rate": 5e-06,
"loss": 0.4504,
"step": 710
},
{
"epoch": 1.8262523779327837,
"grad_norm": 0.5294326989128105,
"learning_rate": 5e-06,
"loss": 0.4512,
"step": 720
},
{
"epoch": 1.8516169942929612,
"grad_norm": 0.5043747110843602,
"learning_rate": 5e-06,
"loss": 0.4476,
"step": 730
},
{
"epoch": 1.876981610653139,
"grad_norm": 0.5243372113736201,
"learning_rate": 5e-06,
"loss": 0.4487,
"step": 740
},
{
"epoch": 1.9023462270133165,
"grad_norm": 0.46489075414726855,
"learning_rate": 5e-06,
"loss": 0.4477,
"step": 750
},
{
"epoch": 1.927710843373494,
"grad_norm": 0.47070137502563003,
"learning_rate": 5e-06,
"loss": 0.4491,
"step": 760
},
{
"epoch": 1.9530754597336717,
"grad_norm": 0.5114250833346574,
"learning_rate": 5e-06,
"loss": 0.4497,
"step": 770
},
{
"epoch": 1.978440076093849,
"grad_norm": 0.44673587993328173,
"learning_rate": 5e-06,
"loss": 0.4461,
"step": 780
},
{
"epoch": 1.9987317691819912,
"eval_loss": 0.4722590744495392,
"eval_runtime": 136.3111,
"eval_samples_per_second": 77.903,
"eval_steps_per_second": 0.609,
"step": 788
},
{
"epoch": 2.0038046924540267,
"grad_norm": 0.5253387493826779,
"learning_rate": 5e-06,
"loss": 0.4406,
"step": 790
},
{
"epoch": 2.029169308814204,
"grad_norm": 0.5649979072148124,
"learning_rate": 5e-06,
"loss": 0.4174,
"step": 800
},
{
"epoch": 2.0545339251743817,
"grad_norm": 0.554555886277626,
"learning_rate": 5e-06,
"loss": 0.416,
"step": 810
},
{
"epoch": 2.0798985415345594,
"grad_norm": 0.5380213608538502,
"learning_rate": 5e-06,
"loss": 0.4075,
"step": 820
},
{
"epoch": 2.1052631578947367,
"grad_norm": 0.6590004861365489,
"learning_rate": 5e-06,
"loss": 0.415,
"step": 830
},
{
"epoch": 2.1306277742549145,
"grad_norm": 0.5145129946467305,
"learning_rate": 5e-06,
"loss": 0.4096,
"step": 840
},
{
"epoch": 2.1559923906150917,
"grad_norm": 0.572199886696882,
"learning_rate": 5e-06,
"loss": 0.4189,
"step": 850
},
{
"epoch": 2.1813570069752695,
"grad_norm": 0.5756593969633285,
"learning_rate": 5e-06,
"loss": 0.4201,
"step": 860
},
{
"epoch": 2.206721623335447,
"grad_norm": 0.5265898189979799,
"learning_rate": 5e-06,
"loss": 0.4116,
"step": 870
},
{
"epoch": 2.2320862396956245,
"grad_norm": 0.5424672160350248,
"learning_rate": 5e-06,
"loss": 0.4099,
"step": 880
},
{
"epoch": 2.257450856055802,
"grad_norm": 0.5674446384978195,
"learning_rate": 5e-06,
"loss": 0.416,
"step": 890
},
{
"epoch": 2.2828154724159795,
"grad_norm": 0.5128282183689237,
"learning_rate": 5e-06,
"loss": 0.4157,
"step": 900
},
{
"epoch": 2.308180088776157,
"grad_norm": 0.5135015935006935,
"learning_rate": 5e-06,
"loss": 0.4172,
"step": 910
},
{
"epoch": 2.333544705136335,
"grad_norm": 0.596189153928778,
"learning_rate": 5e-06,
"loss": 0.4152,
"step": 920
},
{
"epoch": 2.3589093214965122,
"grad_norm": 0.5352826549369347,
"learning_rate": 5e-06,
"loss": 0.4149,
"step": 930
},
{
"epoch": 2.38427393785669,
"grad_norm": 0.5014349895803593,
"learning_rate": 5e-06,
"loss": 0.4141,
"step": 940
},
{
"epoch": 2.4096385542168672,
"grad_norm": 0.4805462505254729,
"learning_rate": 5e-06,
"loss": 0.4176,
"step": 950
},
{
"epoch": 2.435003170577045,
"grad_norm": 0.6137290218711765,
"learning_rate": 5e-06,
"loss": 0.419,
"step": 960
},
{
"epoch": 2.4603677869372227,
"grad_norm": 0.5732682054062723,
"learning_rate": 5e-06,
"loss": 0.4163,
"step": 970
},
{
"epoch": 2.4857324032974,
"grad_norm": 0.4771910554061346,
"learning_rate": 5e-06,
"loss": 0.4134,
"step": 980
},
{
"epoch": 2.5110970196575777,
"grad_norm": 0.476330897847943,
"learning_rate": 5e-06,
"loss": 0.4225,
"step": 990
},
{
"epoch": 2.536461636017755,
"grad_norm": 0.47973764991876255,
"learning_rate": 5e-06,
"loss": 0.4145,
"step": 1000
},
{
"epoch": 2.5618262523779327,
"grad_norm": 0.5939904213084772,
"learning_rate": 5e-06,
"loss": 0.4153,
"step": 1010
},
{
"epoch": 2.5871908687381104,
"grad_norm": 0.5936679428712734,
"learning_rate": 5e-06,
"loss": 0.4204,
"step": 1020
},
{
"epoch": 2.6125554850982877,
"grad_norm": 0.5188426106745951,
"learning_rate": 5e-06,
"loss": 0.4183,
"step": 1030
},
{
"epoch": 2.6379201014584654,
"grad_norm": 0.5644339619977095,
"learning_rate": 5e-06,
"loss": 0.4126,
"step": 1040
},
{
"epoch": 2.6632847178186427,
"grad_norm": 0.6020266606747191,
"learning_rate": 5e-06,
"loss": 0.4186,
"step": 1050
},
{
"epoch": 2.6886493341788205,
"grad_norm": 0.4752185053914476,
"learning_rate": 5e-06,
"loss": 0.4138,
"step": 1060
},
{
"epoch": 2.714013950538998,
"grad_norm": 0.7626568079783347,
"learning_rate": 5e-06,
"loss": 0.4135,
"step": 1070
},
{
"epoch": 2.739378566899176,
"grad_norm": 0.5108017704950135,
"learning_rate": 5e-06,
"loss": 0.4154,
"step": 1080
},
{
"epoch": 2.764743183259353,
"grad_norm": 0.5746749293115092,
"learning_rate": 5e-06,
"loss": 0.4173,
"step": 1090
},
{
"epoch": 2.7901077996195305,
"grad_norm": 0.5467822052037948,
"learning_rate": 5e-06,
"loss": 0.4166,
"step": 1100
},
{
"epoch": 2.815472415979708,
"grad_norm": 0.6357622704499519,
"learning_rate": 5e-06,
"loss": 0.4198,
"step": 1110
},
{
"epoch": 2.840837032339886,
"grad_norm": 0.7346508445377833,
"learning_rate": 5e-06,
"loss": 0.4161,
"step": 1120
},
{
"epoch": 2.8662016487000637,
"grad_norm": 0.4767595766550471,
"learning_rate": 5e-06,
"loss": 0.4136,
"step": 1130
},
{
"epoch": 2.891566265060241,
"grad_norm": 0.5450967603642648,
"learning_rate": 5e-06,
"loss": 0.416,
"step": 1140
},
{
"epoch": 2.9169308814204187,
"grad_norm": 0.6310631600995659,
"learning_rate": 5e-06,
"loss": 0.4156,
"step": 1150
},
{
"epoch": 2.942295497780596,
"grad_norm": 0.4875236135766479,
"learning_rate": 5e-06,
"loss": 0.4135,
"step": 1160
},
{
"epoch": 2.9676601141407737,
"grad_norm": 0.5024341899279373,
"learning_rate": 5e-06,
"loss": 0.4185,
"step": 1170
},
{
"epoch": 2.9930247305009514,
"grad_norm": 0.4812185425989623,
"learning_rate": 5e-06,
"loss": 0.422,
"step": 1180
},
{
"epoch": 2.9980976537729864,
"eval_loss": 0.4713599979877472,
"eval_runtime": 132.9609,
"eval_samples_per_second": 79.866,
"eval_steps_per_second": 0.624,
"step": 1182
},
{
"epoch": 2.9980976537729864,
"step": 1182,
"total_flos": 1979475264798720.0,
"train_loss": 0.4632013658985067,
"train_runtime": 20039.4082,
"train_samples_per_second": 30.202,
"train_steps_per_second": 0.059
}
],
"logging_steps": 10,
"max_steps": 1182,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1979475264798720.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}