|
{ |
|
"best_metric": 0.47900134325027466, |
|
"best_model_checkpoint": "Action_model/checkpoint-800", |
|
"epoch": 10.0, |
|
"eval_steps": 100, |
|
"global_step": 1340, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.667541265487671, |
|
"learning_rate": 9.850746268656717e-05, |
|
"loss": 1.0751, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 4.206006050109863, |
|
"learning_rate": 9.701492537313434e-05, |
|
"loss": 0.8775, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 2.1778831481933594, |
|
"learning_rate": 9.552238805970149e-05, |
|
"loss": 0.7614, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.380641222000122, |
|
"learning_rate": 9.402985074626867e-05, |
|
"loss": 0.701, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.6440467834472656, |
|
"learning_rate": 9.253731343283582e-05, |
|
"loss": 0.7766, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_accuracy": 0.8224956063268892, |
|
"eval_loss": 0.6780304312705994, |
|
"eval_runtime": 8.1524, |
|
"eval_samples_per_second": 69.796, |
|
"eval_steps_per_second": 8.832, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.5913617610931396, |
|
"learning_rate": 9.104477611940299e-05, |
|
"loss": 0.8058, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 3.761014938354492, |
|
"learning_rate": 8.955223880597016e-05, |
|
"loss": 0.7853, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 2.7116057872772217, |
|
"learning_rate": 8.813432835820896e-05, |
|
"loss": 0.6584, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 4.109683990478516, |
|
"learning_rate": 8.664179104477612e-05, |
|
"loss": 0.6056, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 3.7420246601104736, |
|
"learning_rate": 8.514925373134329e-05, |
|
"loss": 0.61, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"eval_accuracy": 0.8242530755711776, |
|
"eval_loss": 0.6279409527778625, |
|
"eval_runtime": 7.7566, |
|
"eval_samples_per_second": 73.357, |
|
"eval_steps_per_second": 9.282, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 4.6456379890441895, |
|
"learning_rate": 8.365671641791046e-05, |
|
"loss": 0.6016, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 3.174193859100342, |
|
"learning_rate": 8.216417910447761e-05, |
|
"loss": 0.5734, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 4.6657304763793945, |
|
"learning_rate": 8.067164179104479e-05, |
|
"loss": 0.5684, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 4.963318347930908, |
|
"learning_rate": 7.917910447761194e-05, |
|
"loss": 0.5957, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 5.692253589630127, |
|
"learning_rate": 7.776119402985074e-05, |
|
"loss": 0.4734, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"eval_accuracy": 0.827768014059754, |
|
"eval_loss": 0.5593364834785461, |
|
"eval_runtime": 7.7448, |
|
"eval_samples_per_second": 73.468, |
|
"eval_steps_per_second": 9.297, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 3.6985843181610107, |
|
"learning_rate": 7.626865671641792e-05, |
|
"loss": 0.4646, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 3.7862443923950195, |
|
"learning_rate": 7.477611940298508e-05, |
|
"loss": 0.4568, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 4.695005893707275, |
|
"learning_rate": 7.328358208955224e-05, |
|
"loss": 0.4876, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 3.9707813262939453, |
|
"learning_rate": 7.179104477611941e-05, |
|
"loss": 0.4723, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 3.133072853088379, |
|
"learning_rate": 7.029850746268657e-05, |
|
"loss": 0.5275, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"eval_accuracy": 0.8418277680140598, |
|
"eval_loss": 0.5148488879203796, |
|
"eval_runtime": 7.8181, |
|
"eval_samples_per_second": 72.78, |
|
"eval_steps_per_second": 9.209, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 3.1703858375549316, |
|
"learning_rate": 6.880597014925374e-05, |
|
"loss": 0.353, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 3.551959276199341, |
|
"learning_rate": 6.73134328358209e-05, |
|
"loss": 0.3559, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 3.8375322818756104, |
|
"learning_rate": 6.582089552238806e-05, |
|
"loss": 0.376, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 4.613718032836914, |
|
"learning_rate": 6.432835820895523e-05, |
|
"loss": 0.4183, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 9.122322082519531, |
|
"learning_rate": 6.283582089552239e-05, |
|
"loss": 0.3767, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"eval_accuracy": 0.843585237258348, |
|
"eval_loss": 0.5129419565200806, |
|
"eval_runtime": 7.7011, |
|
"eval_samples_per_second": 73.886, |
|
"eval_steps_per_second": 9.349, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.9917536973953247, |
|
"learning_rate": 6.134328358208955e-05, |
|
"loss": 0.3943, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 3.007828712463379, |
|
"learning_rate": 5.985074626865672e-05, |
|
"loss": 0.3885, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 6.075244426727295, |
|
"learning_rate": 5.835820895522388e-05, |
|
"loss": 0.3312, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 4.8264641761779785, |
|
"learning_rate": 5.686567164179105e-05, |
|
"loss": 0.3408, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 2.8609578609466553, |
|
"learning_rate": 5.537313432835821e-05, |
|
"loss": 0.3207, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"eval_accuracy": 0.8558875219683656, |
|
"eval_loss": 0.4966126084327698, |
|
"eval_runtime": 7.6408, |
|
"eval_samples_per_second": 74.468, |
|
"eval_steps_per_second": 9.423, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 5.234914302825928, |
|
"learning_rate": 5.388059701492537e-05, |
|
"loss": 0.3306, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 4.566553592681885, |
|
"learning_rate": 5.238805970149254e-05, |
|
"loss": 0.3532, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 4.077399253845215, |
|
"learning_rate": 5.08955223880597e-05, |
|
"loss": 0.3501, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 5.07, |
|
"grad_norm": 3.5527923107147217, |
|
"learning_rate": 4.940298507462687e-05, |
|
"loss": 0.3147, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"grad_norm": 4.944146633148193, |
|
"learning_rate": 4.7910447761194035e-05, |
|
"loss": 0.3155, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 5.22, |
|
"eval_accuracy": 0.8453427065026362, |
|
"eval_loss": 0.5251042246818542, |
|
"eval_runtime": 7.8327, |
|
"eval_samples_per_second": 72.644, |
|
"eval_steps_per_second": 9.192, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"grad_norm": 2.9990365505218506, |
|
"learning_rate": 4.6417910447761195e-05, |
|
"loss": 0.3121, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 5.52, |
|
"grad_norm": 1.7021130323410034, |
|
"learning_rate": 4.492537313432836e-05, |
|
"loss": 0.3563, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 5.67, |
|
"grad_norm": 4.41218376159668, |
|
"learning_rate": 4.343283582089552e-05, |
|
"loss": 0.3447, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 2.955658197402954, |
|
"learning_rate": 4.194029850746269e-05, |
|
"loss": 0.2839, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"grad_norm": 3.0852389335632324, |
|
"learning_rate": 4.044776119402985e-05, |
|
"loss": 0.2565, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 5.97, |
|
"eval_accuracy": 0.8629173989455184, |
|
"eval_loss": 0.47900134325027466, |
|
"eval_runtime": 7.78, |
|
"eval_samples_per_second": 73.136, |
|
"eval_steps_per_second": 9.255, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"grad_norm": 2.099174976348877, |
|
"learning_rate": 3.895522388059702e-05, |
|
"loss": 0.2818, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 6.27, |
|
"grad_norm": 3.712127685546875, |
|
"learning_rate": 3.746268656716418e-05, |
|
"loss": 0.2444, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 6.42, |
|
"grad_norm": 2.1818361282348633, |
|
"learning_rate": 3.5970149253731346e-05, |
|
"loss": 0.2418, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 6.57, |
|
"grad_norm": 2.211638927459717, |
|
"learning_rate": 3.447761194029851e-05, |
|
"loss": 0.2684, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"grad_norm": 6.349803924560547, |
|
"learning_rate": 3.298507462686568e-05, |
|
"loss": 0.2791, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 6.72, |
|
"eval_accuracy": 0.8523725834797891, |
|
"eval_loss": 0.5110830664634705, |
|
"eval_runtime": 7.7612, |
|
"eval_samples_per_second": 73.313, |
|
"eval_steps_per_second": 9.277, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 6.87, |
|
"grad_norm": 4.6033759117126465, |
|
"learning_rate": 3.149253731343284e-05, |
|
"loss": 0.2444, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 6.079771995544434, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2812, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 7.16, |
|
"grad_norm": 3.743011474609375, |
|
"learning_rate": 2.8507462686567167e-05, |
|
"loss": 0.183, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 7.31, |
|
"grad_norm": 4.840090751647949, |
|
"learning_rate": 2.701492537313433e-05, |
|
"loss": 0.2689, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"grad_norm": 3.213412284851074, |
|
"learning_rate": 2.5522388059701496e-05, |
|
"loss": 0.1987, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"eval_accuracy": 0.8453427065026362, |
|
"eval_loss": 0.5002422332763672, |
|
"eval_runtime": 7.6836, |
|
"eval_samples_per_second": 74.054, |
|
"eval_steps_per_second": 9.371, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 7.61, |
|
"grad_norm": 2.2559454441070557, |
|
"learning_rate": 2.402985074626866e-05, |
|
"loss": 0.2254, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 7.76, |
|
"grad_norm": 4.895073413848877, |
|
"learning_rate": 2.2537313432835822e-05, |
|
"loss": 0.283, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 7.91, |
|
"grad_norm": 3.8669703006744385, |
|
"learning_rate": 2.1044776119402985e-05, |
|
"loss": 0.281, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 8.06, |
|
"grad_norm": 2.0825252532958984, |
|
"learning_rate": 1.9552238805970148e-05, |
|
"loss": 0.1955, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"grad_norm": 1.101592779159546, |
|
"learning_rate": 1.8059701492537314e-05, |
|
"loss": 0.2083, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"eval_accuracy": 0.8629173989455184, |
|
"eval_loss": 0.5034471154212952, |
|
"eval_runtime": 7.7486, |
|
"eval_samples_per_second": 73.432, |
|
"eval_steps_per_second": 9.292, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"grad_norm": 2.6236236095428467, |
|
"learning_rate": 1.6567164179104477e-05, |
|
"loss": 0.1409, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 8.51, |
|
"grad_norm": 1.433937668800354, |
|
"learning_rate": 1.5074626865671642e-05, |
|
"loss": 0.2434, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 8.66, |
|
"grad_norm": 2.8774006366729736, |
|
"learning_rate": 1.3582089552238805e-05, |
|
"loss": 0.2044, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 8.81, |
|
"grad_norm": 4.404654026031494, |
|
"learning_rate": 1.2089552238805971e-05, |
|
"loss": 0.2153, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"grad_norm": 4.886945724487305, |
|
"learning_rate": 1.0597014925373134e-05, |
|
"loss": 0.2567, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 8.96, |
|
"eval_accuracy": 0.8576449912126538, |
|
"eval_loss": 0.4995073080062866, |
|
"eval_runtime": 7.7157, |
|
"eval_samples_per_second": 73.745, |
|
"eval_steps_per_second": 9.332, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"grad_norm": 0.4216682016849518, |
|
"learning_rate": 9.104477611940299e-06, |
|
"loss": 0.203, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 9.25, |
|
"grad_norm": 4.639057159423828, |
|
"learning_rate": 7.611940298507463e-06, |
|
"loss": 0.1934, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"grad_norm": 4.426870822906494, |
|
"learning_rate": 6.119402985074627e-06, |
|
"loss": 0.2067, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"grad_norm": 2.948902130126953, |
|
"learning_rate": 4.626865671641791e-06, |
|
"loss": 0.2065, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 9.7, |
|
"grad_norm": 2.3631768226623535, |
|
"learning_rate": 3.134328358208955e-06, |
|
"loss": 0.2127, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 9.7, |
|
"eval_accuracy": 0.8488576449912126, |
|
"eval_loss": 0.5034462809562683, |
|
"eval_runtime": 7.6386, |
|
"eval_samples_per_second": 74.49, |
|
"eval_steps_per_second": 9.426, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 9.85, |
|
"grad_norm": 2.909392833709717, |
|
"learning_rate": 1.6417910447761194e-06, |
|
"loss": 0.1547, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 2.5194036960601807, |
|
"learning_rate": 1.4925373134328358e-07, |
|
"loss": 0.2161, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 1340, |
|
"total_flos": 3.3230947683690086e+18, |
|
"train_loss": 0.3864157530798841, |
|
"train_runtime": 1135.6162, |
|
"train_samples_per_second": 37.759, |
|
"train_steps_per_second": 1.18 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 1340, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"total_flos": 3.3230947683690086e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|