|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 24435, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.061387354205033766, |
|
"grad_norm": 4.773414134979248, |
|
"learning_rate": 2.9386126457949665e-05, |
|
"loss": 0.7901, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.12277470841006753, |
|
"grad_norm": 5.613208293914795, |
|
"learning_rate": 2.8772252915899326e-05, |
|
"loss": 0.686, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.1841620626151013, |
|
"grad_norm": 6.7994384765625, |
|
"learning_rate": 2.815837937384899e-05, |
|
"loss": 0.6822, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.24554941682013506, |
|
"grad_norm": 4.575761318206787, |
|
"learning_rate": 2.7544505831798647e-05, |
|
"loss": 0.6375, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3069367710251688, |
|
"grad_norm": 5.669924736022949, |
|
"learning_rate": 2.693063228974831e-05, |
|
"loss": 0.6488, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3683241252302026, |
|
"grad_norm": 5.164685249328613, |
|
"learning_rate": 2.6316758747697975e-05, |
|
"loss": 0.6232, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.42971147943523635, |
|
"grad_norm": 3.6371734142303467, |
|
"learning_rate": 2.5702885205647636e-05, |
|
"loss": 0.6478, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4910988336402701, |
|
"grad_norm": 13.574466705322266, |
|
"learning_rate": 2.50890116635973e-05, |
|
"loss": 0.6271, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5524861878453039, |
|
"grad_norm": 3.65866756439209, |
|
"learning_rate": 2.4475138121546964e-05, |
|
"loss": 0.6212, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.6138735420503376, |
|
"grad_norm": 7.22385835647583, |
|
"learning_rate": 2.3861264579496625e-05, |
|
"loss": 0.6126, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6752608962553714, |
|
"grad_norm": 4.810811996459961, |
|
"learning_rate": 2.3247391037446286e-05, |
|
"loss": 0.6095, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.7366482504604052, |
|
"grad_norm": 4.843252182006836, |
|
"learning_rate": 2.2633517495395946e-05, |
|
"loss": 0.5984, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.7980356046654389, |
|
"grad_norm": 3.590651750564575, |
|
"learning_rate": 2.201964395334561e-05, |
|
"loss": 0.6056, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.8594229588704727, |
|
"grad_norm": 6.5436177253723145, |
|
"learning_rate": 2.1405770411295274e-05, |
|
"loss": 0.6149, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.9208103130755064, |
|
"grad_norm": 3.6206412315368652, |
|
"learning_rate": 2.0791896869244935e-05, |
|
"loss": 0.6191, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.9821976672805403, |
|
"grad_norm": 3.8409266471862793, |
|
"learning_rate": 2.01780233271946e-05, |
|
"loss": 0.5964, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.043585021485574, |
|
"grad_norm": 3.4344983100891113, |
|
"learning_rate": 1.9564149785144263e-05, |
|
"loss": 0.5774, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.1049723756906078, |
|
"grad_norm": 6.731381893157959, |
|
"learning_rate": 1.8950276243093924e-05, |
|
"loss": 0.5958, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.1663597298956414, |
|
"grad_norm": 3.6475465297698975, |
|
"learning_rate": 1.8336402701043585e-05, |
|
"loss": 0.5723, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.2277470841006752, |
|
"grad_norm": 5.497737884521484, |
|
"learning_rate": 1.7722529158993245e-05, |
|
"loss": 0.5592, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.289134438305709, |
|
"grad_norm": 3.4102916717529297, |
|
"learning_rate": 1.710865561694291e-05, |
|
"loss": 0.563, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.350521792510743, |
|
"grad_norm": 2.2644035816192627, |
|
"learning_rate": 1.6494782074892574e-05, |
|
"loss": 0.5677, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.4119091467157765, |
|
"grad_norm": 4.101683139801025, |
|
"learning_rate": 1.5880908532842234e-05, |
|
"loss": 0.5714, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.4732965009208103, |
|
"grad_norm": 3.3737809658050537, |
|
"learning_rate": 1.52670349907919e-05, |
|
"loss": 0.5716, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.5346838551258442, |
|
"grad_norm": 4.371752738952637, |
|
"learning_rate": 1.465316144874156e-05, |
|
"loss": 0.5706, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.5960712093308778, |
|
"grad_norm": 2.752619504928589, |
|
"learning_rate": 1.4039287906691222e-05, |
|
"loss": 0.576, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.6574585635359116, |
|
"grad_norm": 5.145851135253906, |
|
"learning_rate": 1.3425414364640886e-05, |
|
"loss": 0.5532, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.7188459177409454, |
|
"grad_norm": 3.320183515548706, |
|
"learning_rate": 1.2811540822590546e-05, |
|
"loss": 0.5496, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.780233271945979, |
|
"grad_norm": 8.330361366271973, |
|
"learning_rate": 1.2197667280540209e-05, |
|
"loss": 0.5785, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.8416206261510129, |
|
"grad_norm": 5.743785381317139, |
|
"learning_rate": 1.1583793738489871e-05, |
|
"loss": 0.5484, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.9030079803560467, |
|
"grad_norm": 3.6862549781799316, |
|
"learning_rate": 1.0969920196439534e-05, |
|
"loss": 0.5614, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.9643953345610803, |
|
"grad_norm": 3.0343778133392334, |
|
"learning_rate": 1.0356046654389196e-05, |
|
"loss": 0.5636, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.0257826887661143, |
|
"grad_norm": 3.191669225692749, |
|
"learning_rate": 9.742173112338858e-06, |
|
"loss": 0.5583, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 2.087170042971148, |
|
"grad_norm": 2.4944469928741455, |
|
"learning_rate": 9.128299570288521e-06, |
|
"loss": 0.5592, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.1485573971761815, |
|
"grad_norm": 5.961979389190674, |
|
"learning_rate": 8.514426028238183e-06, |
|
"loss": 0.543, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 2.2099447513812156, |
|
"grad_norm": 3.5605664253234863, |
|
"learning_rate": 7.900552486187846e-06, |
|
"loss": 0.5553, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.271332105586249, |
|
"grad_norm": 4.210664749145508, |
|
"learning_rate": 7.286678944137508e-06, |
|
"loss": 0.5449, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 2.332719459791283, |
|
"grad_norm": 3.2335548400878906, |
|
"learning_rate": 6.6728054020871705e-06, |
|
"loss": 0.5365, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.394106813996317, |
|
"grad_norm": 2.6599347591400146, |
|
"learning_rate": 6.058931860036833e-06, |
|
"loss": 0.5399, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.4554941682013505, |
|
"grad_norm": 4.371065616607666, |
|
"learning_rate": 5.445058317986495e-06, |
|
"loss": 0.5443, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.5168815224063845, |
|
"grad_norm": 4.86035680770874, |
|
"learning_rate": 4.831184775936157e-06, |
|
"loss": 0.5334, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.578268876611418, |
|
"grad_norm": 7.235117435455322, |
|
"learning_rate": 4.21731123388582e-06, |
|
"loss": 0.5513, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.6396562308164517, |
|
"grad_norm": 7.687514305114746, |
|
"learning_rate": 3.603437691835482e-06, |
|
"loss": 0.5367, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.701043585021486, |
|
"grad_norm": 3.5359065532684326, |
|
"learning_rate": 2.9895641497851445e-06, |
|
"loss": 0.5545, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.7624309392265194, |
|
"grad_norm": 3.8917315006256104, |
|
"learning_rate": 2.375690607734807e-06, |
|
"loss": 0.5401, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.823818293431553, |
|
"grad_norm": 3.8702080249786377, |
|
"learning_rate": 1.761817065684469e-06, |
|
"loss": 0.5424, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.885205647636587, |
|
"grad_norm": 5.246346473693848, |
|
"learning_rate": 1.1479435236341315e-06, |
|
"loss": 0.5495, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.9465930018416207, |
|
"grad_norm": 4.763291358947754, |
|
"learning_rate": 5.340699815837937e-07, |
|
"loss": 0.538, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 24435, |
|
"total_flos": 2.645641233904435e+16, |
|
"train_loss": 0.5831284902567558, |
|
"train_runtime": 2740.1657, |
|
"train_samples_per_second": 142.676, |
|
"train_steps_per_second": 8.917 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 24435, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.645641233904435e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|