{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 55077,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02723459883435917,
      "grad_norm": 1.1466608047485352,
      "learning_rate": 4.9558799498883386e-05,
      "loss": 1.2147,
      "step": 500
    },
    {
      "epoch": 0.05446919766871834,
      "grad_norm": 3.79953932762146,
      "learning_rate": 4.910852079815531e-05,
      "loss": 0.4931,
      "step": 1000
    },
    {
      "epoch": 0.0817037965030775,
      "grad_norm": 2.2004730701446533,
      "learning_rate": 4.8654610817582656e-05,
      "loss": 0.508,
      "step": 1500
    },
    {
      "epoch": 0.10893839533743668,
      "grad_norm": 5.668378829956055,
      "learning_rate": 4.820070083701001e-05,
      "loss": 0.7482,
      "step": 2000
    },
    {
      "epoch": 0.13617299417179585,
      "grad_norm": 2.5953574180603027,
      "learning_rate": 4.774679085643735e-05,
      "loss": 0.6969,
      "step": 2500
    },
    {
      "epoch": 0.163407593006155,
      "grad_norm": 1.506287932395935,
      "learning_rate": 4.72928808758647e-05,
      "loss": 0.501,
      "step": 3000
    },
    {
      "epoch": 0.1906421918405142,
      "grad_norm": 1.627192735671997,
      "learning_rate": 4.683897089529205e-05,
      "loss": 0.5321,
      "step": 3500
    },
    {
      "epoch": 0.21787679067487337,
      "grad_norm": 1.1909410953521729,
      "learning_rate": 4.63850609147194e-05,
      "loss": 0.5486,
      "step": 4000
    },
    {
      "epoch": 0.24511138950923253,
      "grad_norm": 1.7642792463302612,
      "learning_rate": 4.5931150934146743e-05,
      "loss": 0.5043,
      "step": 4500
    },
    {
      "epoch": 0.2723459883435917,
      "grad_norm": 2.2761762142181396,
      "learning_rate": 4.547724095357409e-05,
      "loss": 0.4645,
      "step": 5000
    },
    {
      "epoch": 0.2995805871779509,
      "grad_norm": 2.3419032096862793,
      "learning_rate": 4.502333097300144e-05,
      "loss": 0.4197,
      "step": 5500
    },
    {
      "epoch": 0.32681518601231,
      "grad_norm": 3.043858051300049,
      "learning_rate": 4.4569420992428784e-05,
      "loss": 0.4128,
      "step": 6000
    },
    {
      "epoch": 0.3540497848466692,
      "grad_norm": 10.96549129486084,
      "learning_rate": 4.411551101185613e-05,
      "loss": 0.4257,
      "step": 6500
    },
    {
      "epoch": 0.3812843836810284,
      "grad_norm": 2.053966760635376,
      "learning_rate": 4.366160103128348e-05,
      "loss": 0.3825,
      "step": 7000
    },
    {
      "epoch": 0.40851898251538754,
      "grad_norm": 2.4897701740264893,
      "learning_rate": 4.3207691050710824e-05,
      "loss": 0.3699,
      "step": 7500
    },
    {
      "epoch": 0.43575358134974673,
      "grad_norm": 0.603682816028595,
      "learning_rate": 4.275378107013817e-05,
      "loss": 0.4291,
      "step": 8000
    },
    {
      "epoch": 0.46298818018410587,
      "grad_norm": 0.8895764350891113,
      "learning_rate": 4.229987108956552e-05,
      "loss": 0.3847,
      "step": 8500
    },
    {
      "epoch": 0.49022277901846506,
      "grad_norm": 0.43345028162002563,
      "learning_rate": 4.1845961108992865e-05,
      "loss": 0.3642,
      "step": 9000
    },
    {
      "epoch": 0.5174573778528242,
      "grad_norm": 1.6731306314468384,
      "learning_rate": 4.139205112842021e-05,
      "loss": 0.3751,
      "step": 9500
    },
    {
      "epoch": 0.5446919766871834,
      "grad_norm": 1.6484122276306152,
      "learning_rate": 4.093814114784756e-05,
      "loss": 0.4035,
      "step": 10000
    },
    {
      "epoch": 0.5719265755215426,
      "grad_norm": 1.7121918201446533,
      "learning_rate": 4.0484231167274905e-05,
      "loss": 0.3796,
      "step": 10500
    },
    {
      "epoch": 0.5991611743559018,
      "grad_norm": 2.6948976516723633,
      "learning_rate": 4.0030321186702256e-05,
      "loss": 0.372,
      "step": 11000
    },
    {
      "epoch": 0.626395773190261,
      "grad_norm": 3.9049389362335205,
      "learning_rate": 3.957641120612961e-05,
      "loss": 0.3455,
      "step": 11500
    },
    {
      "epoch": 0.65363037202462,
      "grad_norm": 0.8507063388824463,
      "learning_rate": 3.912250122555695e-05,
      "loss": 0.3432,
      "step": 12000
    },
    {
      "epoch": 0.6808649708589792,
      "grad_norm": 1.8186389207839966,
      "learning_rate": 3.8668591244984297e-05,
      "loss": 0.3413,
      "step": 12500
    },
    {
      "epoch": 0.7080995696933384,
      "grad_norm": 1.0689102411270142,
      "learning_rate": 3.821468126441165e-05,
      "loss": 0.3821,
      "step": 13000
    },
    {
      "epoch": 0.7353341685276976,
      "grad_norm": 1.7289353609085083,
      "learning_rate": 3.776077128383899e-05,
      "loss": 0.3861,
      "step": 13500
    },
    {
      "epoch": 0.7625687673620568,
      "grad_norm": 1.1911722421646118,
      "learning_rate": 3.730686130326634e-05,
      "loss": 0.3653,
      "step": 14000
    },
    {
      "epoch": 0.7898033661964159,
      "grad_norm": 6.017147541046143,
      "learning_rate": 3.685295132269369e-05,
      "loss": 0.3523,
      "step": 14500
    },
    {
      "epoch": 0.8170379650307751,
      "grad_norm": 0.9723203778266907,
      "learning_rate": 3.639904134212103e-05,
      "loss": 0.3366,
      "step": 15000
    },
    {
      "epoch": 0.8442725638651343,
      "grad_norm": 1.8334780931472778,
      "learning_rate": 3.594513136154838e-05,
      "loss": 0.3933,
      "step": 15500
    },
    {
      "epoch": 0.8715071626994935,
      "grad_norm": 1.174159049987793,
      "learning_rate": 3.549122138097573e-05,
      "loss": 0.3676,
      "step": 16000
    },
    {
      "epoch": 0.8987417615338527,
      "grad_norm": 0.1367327719926834,
      "learning_rate": 3.503731140040307e-05,
      "loss": 0.3286,
      "step": 16500
    },
    {
      "epoch": 0.9259763603682117,
      "grad_norm": 2.6567485332489014,
      "learning_rate": 3.458340141983042e-05,
      "loss": 0.3401,
      "step": 17000
    },
    {
      "epoch": 0.9532109592025709,
      "grad_norm": 0.11480577290058136,
      "learning_rate": 3.412949143925777e-05,
      "loss": 0.3858,
      "step": 17500
    },
    {
      "epoch": 0.9804455580369301,
      "grad_norm": 2.1067185401916504,
      "learning_rate": 3.3675581458685113e-05,
      "loss": 0.3208,
      "step": 18000
    },
    {
      "epoch": 1.0,
      "eval_runtime": 198.3127,
      "eval_samples_per_second": 9.48,
      "eval_steps_per_second": 9.48,
      "step": 18359
    },
    {
      "epoch": 1.0076801568712892,
      "grad_norm": 4.7463698387146,
      "learning_rate": 3.3221671478112465e-05,
      "loss": 0.3242,
      "step": 18500
    },
    {
      "epoch": 1.0349147557056484,
      "grad_norm": 0.8721242547035217,
      "learning_rate": 3.276776149753981e-05,
      "loss": 0.3311,
      "step": 19000
    },
    {
      "epoch": 1.0621493545400076,
      "grad_norm": 1.6243788003921509,
      "learning_rate": 3.231385151696716e-05,
      "loss": 0.2998,
      "step": 19500
    },
    {
      "epoch": 1.0893839533743668,
      "grad_norm": 0.8359081149101257,
      "learning_rate": 3.1859941536394505e-05,
      "loss": 0.3168,
      "step": 20000
    },
    {
      "epoch": 1.116618552208726,
      "grad_norm": 0.6658357381820679,
      "learning_rate": 3.140603155582185e-05,
      "loss": 0.331,
      "step": 20500
    },
    {
      "epoch": 1.1438531510430852,
      "grad_norm": 0.5795690417289734,
      "learning_rate": 3.09521215752492e-05,
      "loss": 0.3073,
      "step": 21000
    },
    {
      "epoch": 1.1710877498774444,
      "grad_norm": 0.18823903799057007,
      "learning_rate": 3.0498211594676546e-05,
      "loss": 0.2997,
      "step": 21500
    },
    {
      "epoch": 1.1983223487118035,
      "grad_norm": 0.7759385704994202,
      "learning_rate": 3.0044301614103893e-05,
      "loss": 0.3181,
      "step": 22000
    },
    {
      "epoch": 1.2255569475461627,
      "grad_norm": 2.6760952472686768,
      "learning_rate": 2.9590391633531238e-05,
      "loss": 0.3208,
      "step": 22500
    },
    {
      "epoch": 1.252791546380522,
      "grad_norm": 0.7384393215179443,
      "learning_rate": 2.9136481652958586e-05,
      "loss": 0.3236,
      "step": 23000
    },
    {
      "epoch": 1.280026145214881,
      "grad_norm": 0.1822945773601532,
      "learning_rate": 2.8682571672385934e-05,
      "loss": 0.2944,
      "step": 23500
    },
    {
      "epoch": 1.30726074404924,
      "grad_norm": 1.2044873237609863,
      "learning_rate": 2.822866169181328e-05,
      "loss": 0.3148,
      "step": 24000
    },
    {
      "epoch": 1.3344953428835993,
      "grad_norm": 0.12448325008153915,
      "learning_rate": 2.7774751711240626e-05,
      "loss": 0.32,
      "step": 24500
    },
    {
      "epoch": 1.3617299417179585,
      "grad_norm": 0.1313730776309967,
      "learning_rate": 2.7320841730667974e-05,
      "loss": 0.2869,
      "step": 25000
    },
    {
      "epoch": 1.3889645405523177,
      "grad_norm": 0.2766351103782654,
      "learning_rate": 2.686693175009532e-05,
      "loss": 0.3052,
      "step": 25500
    },
    {
      "epoch": 1.4161991393866769,
      "grad_norm": 1.1278197765350342,
      "learning_rate": 2.6413021769522673e-05,
      "loss": 0.2979,
      "step": 26000
    },
    {
      "epoch": 1.443433738221036,
      "grad_norm": 1.9573335647583008,
      "learning_rate": 2.5959111788950018e-05,
      "loss": 0.3226,
      "step": 26500
    },
    {
      "epoch": 1.4706683370553952,
      "grad_norm": 1.249816656112671,
      "learning_rate": 2.5505201808377366e-05,
      "loss": 0.3446,
      "step": 27000
    },
    {
      "epoch": 1.4979029358897544,
      "grad_norm": 1.5611047744750977,
      "learning_rate": 2.5051291827804714e-05,
      "loss": 0.3295,
      "step": 27500
    },
    {
      "epoch": 1.5251375347241134,
      "grad_norm": 0.2420412003993988,
      "learning_rate": 2.4597381847232058e-05,
      "loss": 0.3048,
      "step": 28000
    },
    {
      "epoch": 1.5523721335584728,
      "grad_norm": 0.621634304523468,
      "learning_rate": 2.4143471866659406e-05,
      "loss": 0.3135,
      "step": 28500
    },
    {
      "epoch": 1.5796067323928318,
      "grad_norm": 0.09876800328493118,
      "learning_rate": 2.3689561886086754e-05,
      "loss": 0.3231,
      "step": 29000
    },
    {
      "epoch": 1.606841331227191,
      "grad_norm": 0.10343176126480103,
      "learning_rate": 2.32356519055141e-05,
      "loss": 0.3441,
      "step": 29500
    },
    {
      "epoch": 1.6340759300615502,
      "grad_norm": 0.44668447971343994,
      "learning_rate": 2.2781741924941447e-05,
      "loss": 0.3297,
      "step": 30000
    },
    {
      "epoch": 1.6613105288959094,
      "grad_norm": 0.37340623140335083,
      "learning_rate": 2.2327831944368795e-05,
      "loss": 0.3026,
      "step": 30500
    },
    {
      "epoch": 1.6885451277302685,
      "grad_norm": 0.21011939644813538,
      "learning_rate": 2.187392196379614e-05,
      "loss": 0.3254,
      "step": 31000
    },
    {
      "epoch": 1.7157797265646277,
      "grad_norm": 1.6312121152877808,
      "learning_rate": 2.1420011983223487e-05,
      "loss": 0.3186,
      "step": 31500
    },
    {
      "epoch": 1.743014325398987,
      "grad_norm": 1.275604248046875,
      "learning_rate": 2.096700982261198e-05,
      "loss": 0.2971,
      "step": 32000
    },
    {
      "epoch": 1.770248924233346,
      "grad_norm": 1.6331900358200073,
      "learning_rate": 2.0513099842039328e-05,
      "loss": 0.3249,
      "step": 32500
    },
    {
      "epoch": 1.7974835230677053,
      "grad_norm": 1.0726169347763062,
      "learning_rate": 2.0059189861466676e-05,
      "loss": 0.3159,
      "step": 33000
    },
    {
      "epoch": 1.8247181219020643,
      "grad_norm": 2.8441109657287598,
      "learning_rate": 1.9606187700855166e-05,
      "loss": 0.3013,
      "step": 33500
    },
    {
      "epoch": 1.8519527207364237,
      "grad_norm": 1.9688265323638916,
      "learning_rate": 1.915318554024366e-05,
      "loss": 0.3048,
      "step": 34000
    },
    {
      "epoch": 1.8791873195707827,
      "grad_norm": 0.29343387484550476,
      "learning_rate": 1.8699275559671007e-05,
      "loss": 0.3037,
      "step": 34500
    },
    {
      "epoch": 1.9064219184051419,
      "grad_norm": 1.4208123683929443,
      "learning_rate": 1.8247181219020645e-05,
      "loss": 0.3486,
      "step": 35000
    },
    {
      "epoch": 1.933656517239501,
      "grad_norm": 0.09636660665273666,
      "learning_rate": 1.7793271238447993e-05,
      "loss": 0.3057,
      "step": 35500
    },
    {
      "epoch": 1.9608911160738602,
      "grad_norm": 2.4064226150512695,
      "learning_rate": 1.7339361257875337e-05,
      "loss": 0.2992,
      "step": 36000
    },
    {
      "epoch": 1.9881257149082194,
      "grad_norm": 0.09306484460830688,
      "learning_rate": 1.6885451277302685e-05,
      "loss": 0.2987,
      "step": 36500
    },
    {
      "epoch": 2.0,
      "eval_runtime": 197.752,
      "eval_samples_per_second": 9.507,
      "eval_steps_per_second": 9.507,
      "step": 36718
    },
    {
      "epoch": 2.0153603137425784,
      "grad_norm": 1.6785128116607666,
      "learning_rate": 1.6431541296730033e-05,
      "loss": 0.3208,
      "step": 37000
    },
    {
      "epoch": 2.042594912576938,
      "grad_norm": 3.7624003887176514,
      "learning_rate": 1.5978539136118526e-05,
      "loss": 0.2908,
      "step": 37500
    },
    {
      "epoch": 2.069829511411297,
      "grad_norm": 0.20085683465003967,
      "learning_rate": 1.552462915554587e-05,
      "loss": 0.2729,
      "step": 38000
    },
    {
      "epoch": 2.097064110245656,
      "grad_norm": 0.11236262321472168,
      "learning_rate": 1.5070719174973219e-05,
      "loss": 0.2664,
      "step": 38500
    },
    {
      "epoch": 2.124298709080015,
      "grad_norm": 2.0708374977111816,
      "learning_rate": 1.4616809194400567e-05,
      "loss": 0.2511,
      "step": 39000
    },
    {
      "epoch": 2.1515333079143746,
      "grad_norm": 1.7030911445617676,
      "learning_rate": 1.4162899213827916e-05,
      "loss": 0.2809,
      "step": 39500
    },
    {
      "epoch": 2.1787679067487336,
      "grad_norm": 0.11112015694379807,
      "learning_rate": 1.3708989233255262e-05,
      "loss": 0.2294,
      "step": 40000
    },
    {
      "epoch": 2.206002505583093,
      "grad_norm": 2.3932106494903564,
      "learning_rate": 1.3255079252682609e-05,
      "loss": 0.2881,
      "step": 40500
    },
    {
      "epoch": 2.233237104417452,
      "grad_norm": 0.9254179000854492,
      "learning_rate": 1.2801169272109957e-05,
      "loss": 0.2532,
      "step": 41000
    },
    {
      "epoch": 2.2604717032518113,
      "grad_norm": 0.17265941202640533,
      "learning_rate": 1.2347259291537303e-05,
      "loss": 0.2502,
      "step": 41500
    },
    {
      "epoch": 2.2877063020861703,
      "grad_norm": 2.3043088912963867,
      "learning_rate": 1.189334931096465e-05,
      "loss": 0.2799,
      "step": 42000
    },
    {
      "epoch": 2.3149409009205293,
      "grad_norm": 0.11663592606782913,
      "learning_rate": 1.1439439330391997e-05,
      "loss": 0.2569,
      "step": 42500
    },
    {
      "epoch": 2.3421754997548887,
      "grad_norm": 2.5327258110046387,
      "learning_rate": 1.0986437169780488e-05,
      "loss": 0.2735,
      "step": 43000
    },
    {
      "epoch": 2.3694100985892477,
      "grad_norm": 1.2668527364730835,
      "learning_rate": 1.0532527189207838e-05,
      "loss": 0.2692,
      "step": 43500
    },
    {
      "epoch": 2.396644697423607,
      "grad_norm": 1.1176379919052124,
      "learning_rate": 1.0078617208635184e-05,
      "loss": 0.2984,
      "step": 44000
    },
    {
      "epoch": 2.423879296257966,
      "grad_norm": 0.13128969073295593,
      "learning_rate": 9.62470722806253e-06,
      "loss": 0.2592,
      "step": 44500
    },
    {
      "epoch": 2.4511138950923255,
      "grad_norm": 0.8079116344451904,
      "learning_rate": 9.170797247489878e-06,
      "loss": 0.2869,
      "step": 45000
    },
    {
      "epoch": 2.4783484939266844,
      "grad_norm": 0.9324661493301392,
      "learning_rate": 8.716887266917226e-06,
      "loss": 0.2674,
      "step": 45500
    },
    {
      "epoch": 2.505583092761044,
      "grad_norm": 0.18096031248569489,
      "learning_rate": 8.262977286344572e-06,
      "loss": 0.2852,
      "step": 46000
    },
    {
      "epoch": 2.532817691595403,
      "grad_norm": 0.13841697573661804,
      "learning_rate": 7.811790765655356e-06,
      "loss": 0.2747,
      "step": 46500
    },
    {
      "epoch": 2.560052290429762,
      "grad_norm": 2.215595006942749,
      "learning_rate": 7.357880785082703e-06,
      "loss": 0.2836,
      "step": 47000
    },
    {
      "epoch": 2.587286889264121,
      "grad_norm": 0.17113931477069855,
      "learning_rate": 6.90397080451005e-06,
      "loss": 0.2791,
      "step": 47500
    },
    {
      "epoch": 2.61452148809848,
      "grad_norm": 1.888545274734497,
      "learning_rate": 6.450060823937397e-06,
      "loss": 0.2685,
      "step": 48000
    },
    {
      "epoch": 2.6417560869328396,
      "grad_norm": 0.15251892805099487,
      "learning_rate": 5.996150843364744e-06,
      "loss": 0.2958,
      "step": 48500
    },
    {
      "epoch": 2.6689906857671986,
      "grad_norm": 2.180168628692627,
      "learning_rate": 5.542240862792091e-06,
      "loss": 0.2887,
      "step": 49000
    },
    {
      "epoch": 2.696225284601558,
      "grad_norm": 1.863853931427002,
      "learning_rate": 5.088330882219438e-06,
      "loss": 0.2931,
      "step": 49500
    },
    {
      "epoch": 2.723459883435917,
      "grad_norm": 2.8054542541503906,
      "learning_rate": 4.634420901646785e-06,
      "loss": 0.2659,
      "step": 50000
    },
    {
      "epoch": 2.7506944822702764,
      "grad_norm": 3.5175323486328125,
      "learning_rate": 4.180510921074133e-06,
      "loss": 0.2574,
      "step": 50500
    },
    {
      "epoch": 2.7779290811046353,
      "grad_norm": 0.14439070224761963,
      "learning_rate": 3.727508760462625e-06,
      "loss": 0.2883,
      "step": 51000
    },
    {
      "epoch": 2.8051636799389943,
      "grad_norm": 0.41310277581214905,
      "learning_rate": 3.2735987798899726e-06,
      "loss": 0.2963,
      "step": 51500
    },
    {
      "epoch": 2.8323982787733537,
      "grad_norm": 0.3658026158809662,
      "learning_rate": 2.8196887993173193e-06,
      "loss": 0.2577,
      "step": 52000
    },
    {
      "epoch": 2.8596328776077127,
      "grad_norm": 0.11469651013612747,
      "learning_rate": 2.365778818744667e-06,
      "loss": 0.2784,
      "step": 52500
    },
    {
      "epoch": 2.886867476442072,
      "grad_norm": 0.16579371690750122,
      "learning_rate": 1.911868838172014e-06,
      "loss": 0.2777,
      "step": 53000
    },
    {
      "epoch": 2.914102075276431,
      "grad_norm": 3.678469657897949,
      "learning_rate": 1.457958857599361e-06,
      "loss": 0.2688,
      "step": 53500
    },
    {
      "epoch": 2.9413366741107905,
      "grad_norm": 0.30534350872039795,
      "learning_rate": 1.0040488770267082e-06,
      "loss": 0.2776,
      "step": 54000
    },
    {
      "epoch": 2.9685712729451494,
      "grad_norm": 0.42191004753112793,
      "learning_rate": 5.510467164152006e-07,
      "loss": 0.2729,
      "step": 54500
    },
    {
      "epoch": 2.995805871779509,
      "grad_norm": 1.7490407228469849,
      "learning_rate": 9.71367358425477e-08,
      "loss": 0.2829,
      "step": 55000
    },
    {
      "epoch": 3.0,
      "eval_runtime": 197.6032,
      "eval_samples_per_second": 9.514,
      "eval_steps_per_second": 9.514,
      "step": 55077
    }
  ],
  "logging_steps": 500,
  "max_steps": 55077,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.4605508716999475e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}