|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 5374, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.018608113137327874, |
|
"grad_norm": 0.15951524674892426, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 2.4521, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03721622627465575, |
|
"grad_norm": 0.10372528433799744, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.4034, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.05582433941198362, |
|
"grad_norm": 0.20789675414562225, |
|
"learning_rate": 6e-06, |
|
"loss": 2.4476, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0744324525493115, |
|
"grad_norm": 0.2809026539325714, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 2.4063, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.09304056568663938, |
|
"grad_norm": 0.3346979320049286, |
|
"learning_rate": 1e-05, |
|
"loss": 2.3631, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.11164867882396724, |
|
"grad_norm": 0.42336997389793396, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.3173, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.13025679196129514, |
|
"grad_norm": 0.4591374099254608, |
|
"learning_rate": 1.4e-05, |
|
"loss": 2.2564, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.148864905098623, |
|
"grad_norm": 0.5659797787666321, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 2.273, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.16747301823595087, |
|
"grad_norm": 0.5892286896705627, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.2587, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.18608113137327875, |
|
"grad_norm": 0.5323100686073303, |
|
"learning_rate": 2e-05, |
|
"loss": 2.1991, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.20468924451060663, |
|
"grad_norm": 0.6430222988128662, |
|
"learning_rate": 1.9974217472841614e-05, |
|
"loss": 2.1777, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.22329735764793449, |
|
"grad_norm": 0.6999238133430481, |
|
"learning_rate": 1.9897002839107783e-05, |
|
"loss": 2.1297, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.24190547078526237, |
|
"grad_norm": 0.8810726404190063, |
|
"learning_rate": 1.9768754256476755e-05, |
|
"loss": 2.17, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.2605135839225903, |
|
"grad_norm": 0.8605418801307678, |
|
"learning_rate": 1.9590133039461487e-05, |
|
"loss": 2.13, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.27912169705991813, |
|
"grad_norm": 0.711585521697998, |
|
"learning_rate": 1.936206024933772e-05, |
|
"loss": 2.162, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.297729810197246, |
|
"grad_norm": 0.8360315561294556, |
|
"learning_rate": 1.908571194468655e-05, |
|
"loss": 2.165, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.3163379233345739, |
|
"grad_norm": 0.9491225481033325, |
|
"learning_rate": 1.8762513117041943e-05, |
|
"loss": 2.0905, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.33494603647190174, |
|
"grad_norm": 0.6673754453659058, |
|
"learning_rate": 1.839413034291416e-05, |
|
"loss": 2.1022, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.35355414960922965, |
|
"grad_norm": 0.8447780013084412, |
|
"learning_rate": 1.7982463190078928e-05, |
|
"loss": 2.1088, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.3721622627465575, |
|
"grad_norm": 0.758438229560852, |
|
"learning_rate": 1.752963442244589e-05, |
|
"loss": 2.1265, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.39077037588388536, |
|
"grad_norm": 1.1905587911605835, |
|
"learning_rate": 1.703797905401496e-05, |
|
"loss": 2.0621, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.40937848902121327, |
|
"grad_norm": 0.8596579432487488, |
|
"learning_rate": 1.6510032308363964e-05, |
|
"loss": 2.0768, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.4279866021585411, |
|
"grad_norm": 0.8008110523223877, |
|
"learning_rate": 1.5948516545754497e-05, |
|
"loss": 2.134, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.44659471529586897, |
|
"grad_norm": 0.7902112007141113, |
|
"learning_rate": 1.5356327225266418e-05, |
|
"loss": 2.0793, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4652028284331969, |
|
"grad_norm": 1.021168828010559, |
|
"learning_rate": 1.4736517974347408e-05, |
|
"loss": 2.0917, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.48381094157052473, |
|
"grad_norm": 1.0414773225784302, |
|
"learning_rate": 1.4092284842766439e-05, |
|
"loss": 2.0306, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.5024190547078526, |
|
"grad_norm": 0.8652853965759277, |
|
"learning_rate": 1.3426949822165768e-05, |
|
"loss": 2.0742, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.5210271678451805, |
|
"grad_norm": 0.8237012624740601, |
|
"learning_rate": 1.2743943716193017e-05, |
|
"loss": 2.0428, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5396352809825083, |
|
"grad_norm": 1.091333270072937, |
|
"learning_rate": 1.2046788449543496e-05, |
|
"loss": 2.0632, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.5582433941198363, |
|
"grad_norm": 0.9181482791900635, |
|
"learning_rate": 1.1339078907136409e-05, |
|
"loss": 1.9869, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5768515072571642, |
|
"grad_norm": 1.1163825988769531, |
|
"learning_rate": 1.0624464397071229e-05, |
|
"loss": 2.0724, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.595459620394492, |
|
"grad_norm": 1.086317539215088, |
|
"learning_rate": 9.906629832950659e-06, |
|
"loss": 2.0576, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.6140677335318199, |
|
"grad_norm": 0.822471559047699, |
|
"learning_rate": 9.189276732603637e-06, |
|
"loss": 2.1059, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.6326758466691478, |
|
"grad_norm": 0.9485145807266235, |
|
"learning_rate": 8.47610413118853e-06, |
|
"loss": 2.0566, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6512839598064756, |
|
"grad_norm": 0.9303804039955139, |
|
"learning_rate": 7.770789507098263e-06, |
|
"loss": 2.0242, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6698920729438035, |
|
"grad_norm": 1.1080023050308228, |
|
"learning_rate": 7.076969819022999e-06, |
|
"loss": 1.9736, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6885001860811314, |
|
"grad_norm": 1.0893449783325195, |
|
"learning_rate": 6.3982227519528986e-06, |
|
"loss": 2.0214, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.7071082992184593, |
|
"grad_norm": 0.9729887843132019, |
|
"learning_rate": 5.738048268826046e-06, |
|
"loss": 2.0238, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.7257164123557871, |
|
"grad_norm": 0.7429510951042175, |
|
"learning_rate": 5.099850562950539e-06, |
|
"loss": 2.0608, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.744324525493115, |
|
"grad_norm": 1.201204538345337, |
|
"learning_rate": 4.486920504263212e-06, |
|
"loss": 1.9825, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7629326386304429, |
|
"grad_norm": 0.9092488884925842, |
|
"learning_rate": 3.902418669940925e-06, |
|
"loss": 2.0295, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.7815407517677707, |
|
"grad_norm": 1.1268125772476196, |
|
"learning_rate": 3.3493590468671868e-06, |
|
"loss": 2.029, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.8001488649050986, |
|
"grad_norm": 0.817008376121521, |
|
"learning_rate": 2.8305934899924135e-06, |
|
"loss": 2.0129, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.8187569780424265, |
|
"grad_norm": 0.9579901099205017, |
|
"learning_rate": 2.348797016728398e-06, |
|
"loss": 2.0472, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.8373650911797543, |
|
"grad_norm": 0.8800289034843445, |
|
"learning_rate": 1.9064540132064946e-06, |
|
"loss": 1.959, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.8559732043170822, |
|
"grad_norm": 0.9605334997177124, |
|
"learning_rate": 1.505845423527027e-06, |
|
"loss": 1.9486, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8745813174544101, |
|
"grad_norm": 0.7564200162887573, |
|
"learning_rate": 1.1490369880586516e-06, |
|
"loss": 2.0324, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.8931894305917379, |
|
"grad_norm": 1.0660285949707031, |
|
"learning_rate": 8.378685914369323e-07, |
|
"loss": 2.0176, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.9117975437290659, |
|
"grad_norm": 1.1286784410476685, |
|
"learning_rate": 5.739447751892135e-07, |
|
"loss": 2.0409, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.9304056568663938, |
|
"grad_norm": 0.9878057241439819, |
|
"learning_rate": 3.586264639075265e-07, |
|
"loss": 1.967, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9490137700037217, |
|
"grad_norm": 0.9573999643325806, |
|
"learning_rate": 1.9302394763353606e-07, |
|
"loss": 2.0823, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.9676218831410495, |
|
"grad_norm": 0.7985095381736755, |
|
"learning_rate": 7.79911566419056e-08, |
|
"loss": 2.0383, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9862299962783774, |
|
"grad_norm": 1.0881154537200928, |
|
"learning_rate": 1.4121258144207395e-08, |
|
"loss": 2.0119, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 5374, |
|
"total_flos": 9.788605661184e+16, |
|
"train_loss": 2.1113919824927456, |
|
"train_runtime": 862.752, |
|
"train_samples_per_second": 12.458, |
|
"train_steps_per_second": 6.229 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5374, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.788605661184e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|