|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 2,
  "global_step": 56,
  "is_hyper_param_search": true,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07142857142857142,
      "grad_norm": 268.4626770019531,
      "learning_rate": 0.0005621725453848409,
      "loss": 4.6194,
      "step": 2
    },
    {
      "epoch": 0.07142857142857142,
      "eval_loss": 4.2445478439331055,
      "eval_runtime": 108.1522,
      "eval_samples_per_second": 3.236,
      "eval_steps_per_second": 0.055,
      "step": 2
    },
    {
      "epoch": 0.14285714285714285,
      "grad_norm": 71.9693374633789,
      "learning_rate": 0.0005413513400002172,
      "loss": 6.5579,
      "step": 4
    },
    {
      "epoch": 0.14285714285714285,
      "eval_loss": 4.125729560852051,
      "eval_runtime": 146.7071,
      "eval_samples_per_second": 2.386,
      "eval_steps_per_second": 0.041,
      "step": 4
    },
    {
      "epoch": 0.21428571428571427,
      "grad_norm": 5.648405075073242,
      "learning_rate": 0.0005205301346155935,
      "loss": 4.9826,
      "step": 6
    },
    {
      "epoch": 0.21428571428571427,
      "eval_loss": 5.672520160675049,
      "eval_runtime": 142.0721,
      "eval_samples_per_second": 2.464,
      "eval_steps_per_second": 0.042,
      "step": 6
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 6.27989387512207,
      "learning_rate": 0.0004997089292309697,
      "loss": 5.9002,
      "step": 8
    },
    {
      "epoch": 0.2857142857142857,
      "eval_loss": 4.104394912719727,
      "eval_runtime": 135.6009,
      "eval_samples_per_second": 2.581,
      "eval_steps_per_second": 0.044,
      "step": 8
    },
    {
      "epoch": 0.35714285714285715,
      "grad_norm": 0.1307026743888855,
      "learning_rate": 0.000478887723846346,
      "loss": 4.8597,
      "step": 10
    },
    {
      "epoch": 0.35714285714285715,
      "eval_loss": 4.095502853393555,
      "eval_runtime": 111.3151,
      "eval_samples_per_second": 3.144,
      "eval_steps_per_second": 0.054,
      "step": 10
    },
    {
      "epoch": 0.42857142857142855,
      "grad_norm": 0.01885412447154522,
      "learning_rate": 0.00045806651846172226,
      "loss": 4.8531,
      "step": 12
    },
    {
      "epoch": 0.42857142857142855,
      "eval_loss": 4.094283580780029,
      "eval_runtime": 109.2586,
      "eval_samples_per_second": 3.203,
      "eval_steps_per_second": 0.055,
      "step": 12
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.010488088242709637,
      "learning_rate": 0.0004372453130770985,
      "loss": 4.8523,
      "step": 14
    },
    {
      "epoch": 0.5,
      "eval_loss": 4.094061851501465,
      "eval_runtime": 137.8714,
      "eval_samples_per_second": 2.539,
      "eval_steps_per_second": 0.044,
      "step": 14
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.006233076564967632,
      "learning_rate": 0.00041642410769247477,
      "loss": 4.8521,
      "step": 16
    },
    {
      "epoch": 0.5714285714285714,
      "eval_loss": 4.093998432159424,
      "eval_runtime": 129.4726,
      "eval_samples_per_second": 2.703,
      "eval_steps_per_second": 0.046,
      "step": 16
    },
    {
      "epoch": 0.6428571428571429,
      "grad_norm": 0.02763601951301098,
      "learning_rate": 0.00039560290230785105,
      "loss": 4.8521,
      "step": 18
    },
    {
      "epoch": 0.6428571428571429,
      "eval_loss": 4.09397554397583,
      "eval_runtime": 109.2369,
      "eval_samples_per_second": 3.204,
      "eval_steps_per_second": 0.055,
      "step": 18
    },
    {
      "epoch": 0.7142857142857143,
      "grad_norm": 0.005364938639104366,
      "learning_rate": 0.00037478169692322733,
      "loss": 4.8521,
      "step": 20
    },
    {
      "epoch": 0.7142857142857143,
      "eval_loss": 4.093966960906982,
      "eval_runtime": 129.3687,
      "eval_samples_per_second": 2.705,
      "eval_steps_per_second": 0.046,
      "step": 20
    },
    {
      "epoch": 0.7857142857142857,
      "grad_norm": 0.0038224116433411837,
      "learning_rate": 0.00035396049153860355,
      "loss": 4.8521,
      "step": 22
    },
    {
      "epoch": 0.7857142857142857,
      "eval_loss": 4.093982219696045,
      "eval_runtime": 110.6155,
      "eval_samples_per_second": 3.164,
      "eval_steps_per_second": 0.054,
      "step": 22
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.005476065911352634,
      "learning_rate": 0.0003331392861539798,
      "loss": 4.8521,
      "step": 24
    },
    {
      "epoch": 0.8571428571428571,
      "eval_loss": 4.093994140625,
      "eval_runtime": 108.8427,
      "eval_samples_per_second": 3.216,
      "eval_steps_per_second": 0.055,
      "step": 24
    },
    {
      "epoch": 0.9285714285714286,
      "grad_norm": 0.00341013353317976,
      "learning_rate": 0.00031231808076935606,
      "loss": 4.8521,
      "step": 26
    },
    {
      "epoch": 0.9285714285714286,
      "eval_loss": 4.093983173370361,
      "eval_runtime": 109.5893,
      "eval_samples_per_second": 3.194,
      "eval_steps_per_second": 0.055,
      "step": 26
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.004943197127431631,
      "learning_rate": 0.00029149687538473234,
      "loss": 4.3067,
      "step": 28
    },
    {
      "epoch": 1.0,
      "eval_loss": 4.093968868255615,
      "eval_runtime": 109.4688,
      "eval_samples_per_second": 3.197,
      "eval_steps_per_second": 0.055,
      "step": 28
    },
    {
      "epoch": 1.0714285714285714,
      "grad_norm": 0.002467899350449443,
      "learning_rate": 0.0002706756700001086,
      "loss": 4.8521,
      "step": 30
    },
    {
      "epoch": 1.0714285714285714,
      "eval_loss": 4.093968868255615,
      "eval_runtime": 108.5053,
      "eval_samples_per_second": 3.226,
      "eval_steps_per_second": 0.055,
      "step": 30
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.004230957478284836,
      "learning_rate": 0.00024985446461548485,
      "loss": 4.8521,
      "step": 32
    },
    {
      "epoch": 1.1428571428571428,
      "eval_loss": 4.093967914581299,
      "eval_runtime": 121.2409,
      "eval_samples_per_second": 2.887,
      "eval_steps_per_second": 0.049,
      "step": 32
    },
    {
      "epoch": 1.2142857142857142,
      "grad_norm": 0.0021761604584753513,
      "learning_rate": 0.00022903325923086113,
      "loss": 4.8521,
      "step": 34
    },
    {
      "epoch": 1.2142857142857142,
      "eval_loss": 4.093955993652344,
      "eval_runtime": 150.1791,
      "eval_samples_per_second": 2.331,
      "eval_steps_per_second": 0.04,
      "step": 34
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.0025435429997742176,
      "learning_rate": 0.00020821205384623738,
      "loss": 4.8521,
      "step": 36
    },
    {
      "epoch": 1.2857142857142856,
      "eval_loss": 4.093955039978027,
      "eval_runtime": 138.5452,
      "eval_samples_per_second": 2.526,
      "eval_steps_per_second": 0.043,
      "step": 36
    },
    {
      "epoch": 1.3571428571428572,
      "grad_norm": 0.0025120778009295464,
      "learning_rate": 0.00018739084846161366,
      "loss": 4.852,
      "step": 38
    },
    {
      "epoch": 1.3571428571428572,
      "eval_loss": 4.093952178955078,
      "eval_runtime": 159.7134,
      "eval_samples_per_second": 2.191,
      "eval_steps_per_second": 0.038,
      "step": 38
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.0023238961584866047,
      "learning_rate": 0.0001665696430769899,
      "loss": 4.852,
      "step": 40
    },
    {
      "epoch": 1.4285714285714286,
      "eval_loss": 4.093949317932129,
      "eval_runtime": 143.6861,
      "eval_samples_per_second": 2.436,
      "eval_steps_per_second": 0.042,
      "step": 40
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.0012704160762950778,
      "learning_rate": 0.00014574843769236617,
      "loss": 4.852,
      "step": 42
    },
    {
      "epoch": 1.5,
      "eval_loss": 4.0939459800720215,
      "eval_runtime": 122.8463,
      "eval_samples_per_second": 2.849,
      "eval_steps_per_second": 0.049,
      "step": 42
    },
    {
      "epoch": 1.5714285714285714,
      "grad_norm": 0.0016051293350756168,
      "learning_rate": 0.00012492723230774242,
      "loss": 4.852,
      "step": 44
    },
    {
      "epoch": 1.5714285714285714,
      "eval_loss": 4.093944549560547,
      "eval_runtime": 107.8466,
      "eval_samples_per_second": 3.245,
      "eval_steps_per_second": 0.056,
      "step": 44
    },
    {
      "epoch": 1.6428571428571428,
      "grad_norm": 0.0015636960742995143,
      "learning_rate": 0.00010410602692311869,
      "loss": 4.852,
      "step": 46
    },
    {
      "epoch": 1.6428571428571428,
      "eval_loss": 4.093943119049072,
      "eval_runtime": 129.3014,
      "eval_samples_per_second": 2.707,
      "eval_steps_per_second": 0.046,
      "step": 46
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.001410445780493319,
      "learning_rate": 8.328482153849495e-05,
      "loss": 4.8521,
      "step": 48
    },
    {
      "epoch": 1.7142857142857144,
      "eval_loss": 4.093941688537598,
      "eval_runtime": 109.6222,
      "eval_samples_per_second": 3.193,
      "eval_steps_per_second": 0.055,
      "step": 48
    },
    {
      "epoch": 1.7857142857142856,
      "grad_norm": 0.001393631100654602,
      "learning_rate": 6.246361615387121e-05,
      "loss": 4.852,
      "step": 50
    },
    {
      "epoch": 1.7857142857142856,
      "eval_loss": 4.093940734863281,
      "eval_runtime": 122.8998,
      "eval_samples_per_second": 2.848,
      "eval_steps_per_second": 0.049,
      "step": 50
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.0009566438966430724,
      "learning_rate": 4.164241076924747e-05,
      "loss": 4.852,
      "step": 52
    },
    {
      "epoch": 1.8571428571428572,
      "eval_loss": 4.093940258026123,
      "eval_runtime": 134.9009,
      "eval_samples_per_second": 2.594,
      "eval_steps_per_second": 0.044,
      "step": 52
    },
    {
      "epoch": 1.9285714285714286,
      "grad_norm": 0.0013086905237287283,
      "learning_rate": 2.0821205384623736e-05,
      "loss": 4.852,
      "step": 54
    },
    {
      "epoch": 1.9285714285714286,
      "eval_loss": 4.093940258026123,
      "eval_runtime": 126.2369,
      "eval_samples_per_second": 2.773,
      "eval_steps_per_second": 0.048,
      "step": 54
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.0012161381309852004,
      "learning_rate": 0.0,
      "loss": 4.3066,
      "step": 56
    },
    {
      "epoch": 2.0,
      "eval_loss": 4.093940258026123,
      "eval_runtime": 110.6877,
      "eval_samples_per_second": 3.162,
      "eval_steps_per_second": 0.054,
      "step": 56
    }
  ],
  "logging_steps": 2,
  "max_steps": 56,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 407052651766068.0,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": {
    "_wandb": {},
    "assignments": {},
    "decay": 0.01,
    "learning_rate": 0.0005829937507694647,
    "metric": "eval/loss",
    "per_device_train_batch_size": 128
  }
}
|
|