|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 825,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03636363636363636,
      "grad_norm": 3.2293971017711174,
      "learning_rate": 5e-06,
      "loss": 1.0337,
      "step": 10
    },
    {
      "epoch": 0.07272727272727272,
      "grad_norm": 1.2461654883314972,
      "learning_rate": 5e-06,
      "loss": 0.9092,
      "step": 20
    },
    {
      "epoch": 0.10909090909090909,
      "grad_norm": 1.0937923635217501,
      "learning_rate": 5e-06,
      "loss": 0.8658,
      "step": 30
    },
    {
      "epoch": 0.14545454545454545,
      "grad_norm": 1.3350225945199414,
      "learning_rate": 5e-06,
      "loss": 0.844,
      "step": 40
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 1.029425810987488,
      "learning_rate": 5e-06,
      "loss": 0.8249,
      "step": 50
    },
    {
      "epoch": 0.21818181818181817,
      "grad_norm": 1.5219290967515304,
      "learning_rate": 5e-06,
      "loss": 0.8068,
      "step": 60
    },
    {
      "epoch": 0.2545454545454545,
      "grad_norm": 1.6740495880819521,
      "learning_rate": 5e-06,
      "loss": 0.7989,
      "step": 70
    },
    {
      "epoch": 0.2909090909090909,
      "grad_norm": 1.2973735477904815,
      "learning_rate": 5e-06,
      "loss": 0.7921,
      "step": 80
    },
    {
      "epoch": 0.32727272727272727,
      "grad_norm": 0.8566363002967183,
      "learning_rate": 5e-06,
      "loss": 0.781,
      "step": 90
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 0.9961672641644985,
      "learning_rate": 5e-06,
      "loss": 0.7745,
      "step": 100
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.8927257959121373,
      "learning_rate": 5e-06,
      "loss": 0.7754,
      "step": 110
    },
    {
      "epoch": 0.43636363636363634,
      "grad_norm": 0.6135178704985191,
      "learning_rate": 5e-06,
      "loss": 0.772,
      "step": 120
    },
    {
      "epoch": 0.4727272727272727,
      "grad_norm": 0.7431505188106242,
      "learning_rate": 5e-06,
      "loss": 0.7686,
      "step": 130
    },
    {
      "epoch": 0.509090909090909,
      "grad_norm": 0.7150787812569424,
      "learning_rate": 5e-06,
      "loss": 0.7618,
      "step": 140
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 0.6352342662453642,
      "learning_rate": 5e-06,
      "loss": 0.7611,
      "step": 150
    },
    {
      "epoch": 0.5818181818181818,
      "grad_norm": 0.6257901300873526,
      "learning_rate": 5e-06,
      "loss": 0.7569,
      "step": 160
    },
    {
      "epoch": 0.6181818181818182,
      "grad_norm": 0.6387102446786417,
      "learning_rate": 5e-06,
      "loss": 0.7611,
      "step": 170
    },
    {
      "epoch": 0.6545454545454545,
      "grad_norm": 0.5983754152683597,
      "learning_rate": 5e-06,
      "loss": 0.7546,
      "step": 180
    },
    {
      "epoch": 0.6909090909090909,
      "grad_norm": 0.7480127979666656,
      "learning_rate": 5e-06,
      "loss": 0.7566,
      "step": 190
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 0.5804396007389026,
      "learning_rate": 5e-06,
      "loss": 0.75,
      "step": 200
    },
    {
      "epoch": 0.7636363636363637,
      "grad_norm": 0.682148918886327,
      "learning_rate": 5e-06,
      "loss": 0.7476,
      "step": 210
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.8039336411015884,
      "learning_rate": 5e-06,
      "loss": 0.7462,
      "step": 220
    },
    {
      "epoch": 0.8363636363636363,
      "grad_norm": 0.6876607052536684,
      "learning_rate": 5e-06,
      "loss": 0.7411,
      "step": 230
    },
    {
      "epoch": 0.8727272727272727,
      "grad_norm": 0.6588151842699974,
      "learning_rate": 5e-06,
      "loss": 0.7469,
      "step": 240
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.6715213794720472,
      "learning_rate": 5e-06,
      "loss": 0.7378,
      "step": 250
    },
    {
      "epoch": 0.9454545454545454,
      "grad_norm": 0.5870957383826958,
      "learning_rate": 5e-06,
      "loss": 0.7457,
      "step": 260
    },
    {
      "epoch": 0.9818181818181818,
      "grad_norm": 0.6643986810785624,
      "learning_rate": 5e-06,
      "loss": 0.7466,
      "step": 270
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.7417545914649963,
      "eval_runtime": 26.6218,
      "eval_samples_per_second": 278.268,
      "eval_steps_per_second": 1.089,
      "step": 275
    },
    {
      "epoch": 1.018181818181818,
      "grad_norm": 0.9781239153342394,
      "learning_rate": 5e-06,
      "loss": 0.7136,
      "step": 280
    },
    {
      "epoch": 1.0545454545454545,
      "grad_norm": 0.7152925984087143,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 290
    },
    {
      "epoch": 1.0909090909090908,
      "grad_norm": 0.6929492576277494,
      "learning_rate": 5e-06,
      "loss": 0.6894,
      "step": 300
    },
    {
      "epoch": 1.1272727272727272,
      "grad_norm": 0.728764264622129,
      "learning_rate": 5e-06,
      "loss": 0.6935,
      "step": 310
    },
    {
      "epoch": 1.1636363636363636,
      "grad_norm": 0.7252517543389313,
      "learning_rate": 5e-06,
      "loss": 0.6945,
      "step": 320
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.6665160391388197,
      "learning_rate": 5e-06,
      "loss": 0.6886,
      "step": 330
    },
    {
      "epoch": 1.2363636363636363,
      "grad_norm": 0.7161659905517039,
      "learning_rate": 5e-06,
      "loss": 0.6898,
      "step": 340
    },
    {
      "epoch": 1.2727272727272727,
      "grad_norm": 0.5719039452566653,
      "learning_rate": 5e-06,
      "loss": 0.6934,
      "step": 350
    },
    {
      "epoch": 1.309090909090909,
      "grad_norm": 0.6060853746189843,
      "learning_rate": 5e-06,
      "loss": 0.6922,
      "step": 360
    },
    {
      "epoch": 1.3454545454545455,
      "grad_norm": 0.6563719933283224,
      "learning_rate": 5e-06,
      "loss": 0.6912,
      "step": 370
    },
    {
      "epoch": 1.3818181818181818,
      "grad_norm": 0.5958006047997326,
      "learning_rate": 5e-06,
      "loss": 0.6904,
      "step": 380
    },
    {
      "epoch": 1.4181818181818182,
      "grad_norm": 0.7430218105320606,
      "learning_rate": 5e-06,
      "loss": 0.688,
      "step": 390
    },
    {
      "epoch": 1.4545454545454546,
      "grad_norm": 0.6322073230662588,
      "learning_rate": 5e-06,
      "loss": 0.6883,
      "step": 400
    },
    {
      "epoch": 1.490909090909091,
      "grad_norm": 0.7151221978666452,
      "learning_rate": 5e-06,
      "loss": 0.6934,
      "step": 410
    },
    {
      "epoch": 1.5272727272727273,
      "grad_norm": 0.6184168187218901,
      "learning_rate": 5e-06,
      "loss": 0.6916,
      "step": 420
    },
    {
      "epoch": 1.5636363636363635,
      "grad_norm": 0.6280848540221795,
      "learning_rate": 5e-06,
      "loss": 0.6916,
      "step": 430
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.6568705155050817,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 440
    },
    {
      "epoch": 1.6363636363636362,
      "grad_norm": 0.6359258851827682,
      "learning_rate": 5e-06,
      "loss": 0.6851,
      "step": 450
    },
    {
      "epoch": 1.6727272727272728,
      "grad_norm": 0.710888538426671,
      "learning_rate": 5e-06,
      "loss": 0.6872,
      "step": 460
    },
    {
      "epoch": 1.709090909090909,
      "grad_norm": 0.7584066029266229,
      "learning_rate": 5e-06,
      "loss": 0.6849,
      "step": 470
    },
    {
      "epoch": 1.7454545454545456,
      "grad_norm": 0.5960492892442344,
      "learning_rate": 5e-06,
      "loss": 0.6891,
      "step": 480
    },
    {
      "epoch": 1.7818181818181817,
      "grad_norm": 0.5629377755020811,
      "learning_rate": 5e-06,
      "loss": 0.6847,
      "step": 490
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.589716689792314,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 500
    },
    {
      "epoch": 1.8545454545454545,
      "grad_norm": 0.5740509121739076,
      "learning_rate": 5e-06,
      "loss": 0.6888,
      "step": 510
    },
    {
      "epoch": 1.8909090909090909,
      "grad_norm": 0.5891046247600111,
      "learning_rate": 5e-06,
      "loss": 0.6884,
      "step": 520
    },
    {
      "epoch": 1.9272727272727272,
      "grad_norm": 0.6447276827053491,
      "learning_rate": 5e-06,
      "loss": 0.6893,
      "step": 530
    },
    {
      "epoch": 1.9636363636363636,
      "grad_norm": 0.6935516132206995,
      "learning_rate": 5e-06,
      "loss": 0.6868,
      "step": 540
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.5781509823001448,
      "learning_rate": 5e-06,
      "loss": 0.6841,
      "step": 550
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.7281343340873718,
      "eval_runtime": 26.4698,
      "eval_samples_per_second": 279.867,
      "eval_steps_per_second": 1.096,
      "step": 550
    },
    {
      "epoch": 2.036363636363636,
      "grad_norm": 0.7551729949207574,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 560
    },
    {
      "epoch": 2.0727272727272728,
      "grad_norm": 0.6070448901420726,
      "learning_rate": 5e-06,
      "loss": 0.6307,
      "step": 570
    },
    {
      "epoch": 2.109090909090909,
      "grad_norm": 0.7225948313371118,
      "learning_rate": 5e-06,
      "loss": 0.6357,
      "step": 580
    },
    {
      "epoch": 2.1454545454545455,
      "grad_norm": 1.3944109200671733,
      "learning_rate": 5e-06,
      "loss": 0.6375,
      "step": 590
    },
    {
      "epoch": 2.1818181818181817,
      "grad_norm": 1.1390572133302885,
      "learning_rate": 5e-06,
      "loss": 0.635,
      "step": 600
    },
    {
      "epoch": 2.2181818181818183,
      "grad_norm": 0.7900509422330505,
      "learning_rate": 5e-06,
      "loss": 0.6383,
      "step": 610
    },
    {
      "epoch": 2.2545454545454544,
      "grad_norm": 0.594871030626621,
      "learning_rate": 5e-06,
      "loss": 0.6321,
      "step": 620
    },
    {
      "epoch": 2.290909090909091,
      "grad_norm": 0.665898906007086,
      "learning_rate": 5e-06,
      "loss": 0.6341,
      "step": 630
    },
    {
      "epoch": 2.327272727272727,
      "grad_norm": 0.6509722897169726,
      "learning_rate": 5e-06,
      "loss": 0.6326,
      "step": 640
    },
    {
      "epoch": 2.3636363636363638,
      "grad_norm": 0.6231670817005929,
      "learning_rate": 5e-06,
      "loss": 0.6385,
      "step": 650
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.6425410588561774,
      "learning_rate": 5e-06,
      "loss": 0.6373,
      "step": 660
    },
    {
      "epoch": 2.4363636363636365,
      "grad_norm": 0.621241338432262,
      "learning_rate": 5e-06,
      "loss": 0.6399,
      "step": 670
    },
    {
      "epoch": 2.4727272727272727,
      "grad_norm": 0.6924233110335524,
      "learning_rate": 5e-06,
      "loss": 0.6393,
      "step": 680
    },
    {
      "epoch": 2.509090909090909,
      "grad_norm": 0.6419114963815122,
      "learning_rate": 5e-06,
      "loss": 0.6405,
      "step": 690
    },
    {
      "epoch": 2.5454545454545454,
      "grad_norm": 0.7336852368102121,
      "learning_rate": 5e-06,
      "loss": 0.6385,
      "step": 700
    },
    {
      "epoch": 2.581818181818182,
      "grad_norm": 0.7922288944252411,
      "learning_rate": 5e-06,
      "loss": 0.6377,
      "step": 710
    },
    {
      "epoch": 2.618181818181818,
      "grad_norm": 0.6500377491351792,
      "learning_rate": 5e-06,
      "loss": 0.6427,
      "step": 720
    },
    {
      "epoch": 2.6545454545454543,
      "grad_norm": 0.6853834065254241,
      "learning_rate": 5e-06,
      "loss": 0.6346,
      "step": 730
    },
    {
      "epoch": 2.690909090909091,
      "grad_norm": 0.8156333668312422,
      "learning_rate": 5e-06,
      "loss": 0.6414,
      "step": 740
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.6294215183471213,
      "learning_rate": 5e-06,
      "loss": 0.6363,
      "step": 750
    },
    {
      "epoch": 2.7636363636363637,
      "grad_norm": 0.8237171162592375,
      "learning_rate": 5e-06,
      "loss": 0.6421,
      "step": 760
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.6772752476166749,
      "learning_rate": 5e-06,
      "loss": 0.6356,
      "step": 770
    },
    {
      "epoch": 2.8363636363636364,
      "grad_norm": 0.7780500988065099,
      "learning_rate": 5e-06,
      "loss": 0.6425,
      "step": 780
    },
    {
      "epoch": 2.8727272727272726,
      "grad_norm": 0.6862874007983163,
      "learning_rate": 5e-06,
      "loss": 0.6368,
      "step": 790
    },
    {
      "epoch": 2.909090909090909,
      "grad_norm": 0.5748210856771035,
      "learning_rate": 5e-06,
      "loss": 0.6405,
      "step": 800
    },
    {
      "epoch": 2.9454545454545453,
      "grad_norm": 0.6351457621560951,
      "learning_rate": 5e-06,
      "loss": 0.6357,
      "step": 810
    },
    {
      "epoch": 2.981818181818182,
      "grad_norm": 0.586627253325874,
      "learning_rate": 5e-06,
      "loss": 0.6412,
      "step": 820
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.7299705147743225,
      "eval_runtime": 25.9228,
      "eval_samples_per_second": 285.772,
      "eval_steps_per_second": 1.119,
      "step": 825
    },
    {
      "epoch": 3.0,
      "step": 825,
      "total_flos": 1381905727488000.0,
      "train_loss": 0.7044297796307188,
      "train_runtime": 5353.5806,
      "train_samples_per_second": 78.866,
      "train_steps_per_second": 0.154
    }
  ],
  "logging_steps": 10,
  "max_steps": 825,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1381905727488000.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}
|
|