gemma2-2b-it-finetuned-paperqa / trainer_state.json
halyn's picture
feat: upload model
9b2921f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 55077,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02723459883435917,
"grad_norm": 1.1466608047485352,
"learning_rate": 4.9558799498883386e-05,
"loss": 1.2147,
"step": 500
},
{
"epoch": 0.05446919766871834,
"grad_norm": 3.79953932762146,
"learning_rate": 4.910852079815531e-05,
"loss": 0.4931,
"step": 1000
},
{
"epoch": 0.0817037965030775,
"grad_norm": 2.2004730701446533,
"learning_rate": 4.8654610817582656e-05,
"loss": 0.508,
"step": 1500
},
{
"epoch": 0.10893839533743668,
"grad_norm": 5.668378829956055,
"learning_rate": 4.820070083701001e-05,
"loss": 0.7482,
"step": 2000
},
{
"epoch": 0.13617299417179585,
"grad_norm": 2.5953574180603027,
"learning_rate": 4.774679085643735e-05,
"loss": 0.6969,
"step": 2500
},
{
"epoch": 0.163407593006155,
"grad_norm": 1.506287932395935,
"learning_rate": 4.72928808758647e-05,
"loss": 0.501,
"step": 3000
},
{
"epoch": 0.1906421918405142,
"grad_norm": 1.627192735671997,
"learning_rate": 4.683897089529205e-05,
"loss": 0.5321,
"step": 3500
},
{
"epoch": 0.21787679067487337,
"grad_norm": 1.1909410953521729,
"learning_rate": 4.63850609147194e-05,
"loss": 0.5486,
"step": 4000
},
{
"epoch": 0.24511138950923253,
"grad_norm": 1.7642792463302612,
"learning_rate": 4.5931150934146743e-05,
"loss": 0.5043,
"step": 4500
},
{
"epoch": 0.2723459883435917,
"grad_norm": 2.2761762142181396,
"learning_rate": 4.547724095357409e-05,
"loss": 0.4645,
"step": 5000
},
{
"epoch": 0.2995805871779509,
"grad_norm": 2.3419032096862793,
"learning_rate": 4.502333097300144e-05,
"loss": 0.4197,
"step": 5500
},
{
"epoch": 0.32681518601231,
"grad_norm": 3.043858051300049,
"learning_rate": 4.4569420992428784e-05,
"loss": 0.4128,
"step": 6000
},
{
"epoch": 0.3540497848466692,
"grad_norm": 10.96549129486084,
"learning_rate": 4.411551101185613e-05,
"loss": 0.4257,
"step": 6500
},
{
"epoch": 0.3812843836810284,
"grad_norm": 2.053966760635376,
"learning_rate": 4.366160103128348e-05,
"loss": 0.3825,
"step": 7000
},
{
"epoch": 0.40851898251538754,
"grad_norm": 2.4897701740264893,
"learning_rate": 4.3207691050710824e-05,
"loss": 0.3699,
"step": 7500
},
{
"epoch": 0.43575358134974673,
"grad_norm": 0.603682816028595,
"learning_rate": 4.275378107013817e-05,
"loss": 0.4291,
"step": 8000
},
{
"epoch": 0.46298818018410587,
"grad_norm": 0.8895764350891113,
"learning_rate": 4.229987108956552e-05,
"loss": 0.3847,
"step": 8500
},
{
"epoch": 0.49022277901846506,
"grad_norm": 0.43345028162002563,
"learning_rate": 4.1845961108992865e-05,
"loss": 0.3642,
"step": 9000
},
{
"epoch": 0.5174573778528242,
"grad_norm": 1.6731306314468384,
"learning_rate": 4.139205112842021e-05,
"loss": 0.3751,
"step": 9500
},
{
"epoch": 0.5446919766871834,
"grad_norm": 1.6484122276306152,
"learning_rate": 4.093814114784756e-05,
"loss": 0.4035,
"step": 10000
},
{
"epoch": 0.5719265755215426,
"grad_norm": 1.7121918201446533,
"learning_rate": 4.0484231167274905e-05,
"loss": 0.3796,
"step": 10500
},
{
"epoch": 0.5991611743559018,
"grad_norm": 2.6948976516723633,
"learning_rate": 4.0030321186702256e-05,
"loss": 0.372,
"step": 11000
},
{
"epoch": 0.626395773190261,
"grad_norm": 3.9049389362335205,
"learning_rate": 3.957641120612961e-05,
"loss": 0.3455,
"step": 11500
},
{
"epoch": 0.65363037202462,
"grad_norm": 0.8507063388824463,
"learning_rate": 3.912250122555695e-05,
"loss": 0.3432,
"step": 12000
},
{
"epoch": 0.6808649708589792,
"grad_norm": 1.8186389207839966,
"learning_rate": 3.8668591244984297e-05,
"loss": 0.3413,
"step": 12500
},
{
"epoch": 0.7080995696933384,
"grad_norm": 1.0689102411270142,
"learning_rate": 3.821468126441165e-05,
"loss": 0.3821,
"step": 13000
},
{
"epoch": 0.7353341685276976,
"grad_norm": 1.7289353609085083,
"learning_rate": 3.776077128383899e-05,
"loss": 0.3861,
"step": 13500
},
{
"epoch": 0.7625687673620568,
"grad_norm": 1.1911722421646118,
"learning_rate": 3.730686130326634e-05,
"loss": 0.3653,
"step": 14000
},
{
"epoch": 0.7898033661964159,
"grad_norm": 6.017147541046143,
"learning_rate": 3.685295132269369e-05,
"loss": 0.3523,
"step": 14500
},
{
"epoch": 0.8170379650307751,
"grad_norm": 0.9723203778266907,
"learning_rate": 3.639904134212103e-05,
"loss": 0.3366,
"step": 15000
},
{
"epoch": 0.8442725638651343,
"grad_norm": 1.8334780931472778,
"learning_rate": 3.594513136154838e-05,
"loss": 0.3933,
"step": 15500
},
{
"epoch": 0.8715071626994935,
"grad_norm": 1.174159049987793,
"learning_rate": 3.549122138097573e-05,
"loss": 0.3676,
"step": 16000
},
{
"epoch": 0.8987417615338527,
"grad_norm": 0.1367327719926834,
"learning_rate": 3.503731140040307e-05,
"loss": 0.3286,
"step": 16500
},
{
"epoch": 0.9259763603682117,
"grad_norm": 2.6567485332489014,
"learning_rate": 3.458340141983042e-05,
"loss": 0.3401,
"step": 17000
},
{
"epoch": 0.9532109592025709,
"grad_norm": 0.11480577290058136,
"learning_rate": 3.412949143925777e-05,
"loss": 0.3858,
"step": 17500
},
{
"epoch": 0.9804455580369301,
"grad_norm": 2.1067185401916504,
"learning_rate": 3.3675581458685113e-05,
"loss": 0.3208,
"step": 18000
},
{
"epoch": 1.0,
"eval_runtime": 198.3127,
"eval_samples_per_second": 9.48,
"eval_steps_per_second": 9.48,
"step": 18359
},
{
"epoch": 1.0076801568712892,
"grad_norm": 4.7463698387146,
"learning_rate": 3.3221671478112465e-05,
"loss": 0.3242,
"step": 18500
},
{
"epoch": 1.0349147557056484,
"grad_norm": 0.8721242547035217,
"learning_rate": 3.276776149753981e-05,
"loss": 0.3311,
"step": 19000
},
{
"epoch": 1.0621493545400076,
"grad_norm": 1.6243788003921509,
"learning_rate": 3.231385151696716e-05,
"loss": 0.2998,
"step": 19500
},
{
"epoch": 1.0893839533743668,
"grad_norm": 0.8359081149101257,
"learning_rate": 3.1859941536394505e-05,
"loss": 0.3168,
"step": 20000
},
{
"epoch": 1.116618552208726,
"grad_norm": 0.6658357381820679,
"learning_rate": 3.140603155582185e-05,
"loss": 0.331,
"step": 20500
},
{
"epoch": 1.1438531510430852,
"grad_norm": 0.5795690417289734,
"learning_rate": 3.09521215752492e-05,
"loss": 0.3073,
"step": 21000
},
{
"epoch": 1.1710877498774444,
"grad_norm": 0.18823903799057007,
"learning_rate": 3.0498211594676546e-05,
"loss": 0.2997,
"step": 21500
},
{
"epoch": 1.1983223487118035,
"grad_norm": 0.7759385704994202,
"learning_rate": 3.0044301614103893e-05,
"loss": 0.3181,
"step": 22000
},
{
"epoch": 1.2255569475461627,
"grad_norm": 2.6760952472686768,
"learning_rate": 2.9590391633531238e-05,
"loss": 0.3208,
"step": 22500
},
{
"epoch": 1.252791546380522,
"grad_norm": 0.7384393215179443,
"learning_rate": 2.9136481652958586e-05,
"loss": 0.3236,
"step": 23000
},
{
"epoch": 1.280026145214881,
"grad_norm": 0.1822945773601532,
"learning_rate": 2.8682571672385934e-05,
"loss": 0.2944,
"step": 23500
},
{
"epoch": 1.30726074404924,
"grad_norm": 1.2044873237609863,
"learning_rate": 2.822866169181328e-05,
"loss": 0.3148,
"step": 24000
},
{
"epoch": 1.3344953428835993,
"grad_norm": 0.12448325008153915,
"learning_rate": 2.7774751711240626e-05,
"loss": 0.32,
"step": 24500
},
{
"epoch": 1.3617299417179585,
"grad_norm": 0.1313730776309967,
"learning_rate": 2.7320841730667974e-05,
"loss": 0.2869,
"step": 25000
},
{
"epoch": 1.3889645405523177,
"grad_norm": 0.2766351103782654,
"learning_rate": 2.686693175009532e-05,
"loss": 0.3052,
"step": 25500
},
{
"epoch": 1.4161991393866769,
"grad_norm": 1.1278197765350342,
"learning_rate": 2.6413021769522673e-05,
"loss": 0.2979,
"step": 26000
},
{
"epoch": 1.443433738221036,
"grad_norm": 1.9573335647583008,
"learning_rate": 2.5959111788950018e-05,
"loss": 0.3226,
"step": 26500
},
{
"epoch": 1.4706683370553952,
"grad_norm": 1.249816656112671,
"learning_rate": 2.5505201808377366e-05,
"loss": 0.3446,
"step": 27000
},
{
"epoch": 1.4979029358897544,
"grad_norm": 1.5611047744750977,
"learning_rate": 2.5051291827804714e-05,
"loss": 0.3295,
"step": 27500
},
{
"epoch": 1.5251375347241134,
"grad_norm": 0.2420412003993988,
"learning_rate": 2.4597381847232058e-05,
"loss": 0.3048,
"step": 28000
},
{
"epoch": 1.5523721335584728,
"grad_norm": 0.621634304523468,
"learning_rate": 2.4143471866659406e-05,
"loss": 0.3135,
"step": 28500
},
{
"epoch": 1.5796067323928318,
"grad_norm": 0.09876800328493118,
"learning_rate": 2.3689561886086754e-05,
"loss": 0.3231,
"step": 29000
},
{
"epoch": 1.606841331227191,
"grad_norm": 0.10343176126480103,
"learning_rate": 2.32356519055141e-05,
"loss": 0.3441,
"step": 29500
},
{
"epoch": 1.6340759300615502,
"grad_norm": 0.44668447971343994,
"learning_rate": 2.2781741924941447e-05,
"loss": 0.3297,
"step": 30000
},
{
"epoch": 1.6613105288959094,
"grad_norm": 0.37340623140335083,
"learning_rate": 2.2327831944368795e-05,
"loss": 0.3026,
"step": 30500
},
{
"epoch": 1.6885451277302685,
"grad_norm": 0.21011939644813538,
"learning_rate": 2.187392196379614e-05,
"loss": 0.3254,
"step": 31000
},
{
"epoch": 1.7157797265646277,
"grad_norm": 1.6312121152877808,
"learning_rate": 2.1420011983223487e-05,
"loss": 0.3186,
"step": 31500
},
{
"epoch": 1.743014325398987,
"grad_norm": 1.275604248046875,
"learning_rate": 2.096700982261198e-05,
"loss": 0.2971,
"step": 32000
},
{
"epoch": 1.770248924233346,
"grad_norm": 1.6331900358200073,
"learning_rate": 2.0513099842039328e-05,
"loss": 0.3249,
"step": 32500
},
{
"epoch": 1.7974835230677053,
"grad_norm": 1.0726169347763062,
"learning_rate": 2.0059189861466676e-05,
"loss": 0.3159,
"step": 33000
},
{
"epoch": 1.8247181219020643,
"grad_norm": 2.8441109657287598,
"learning_rate": 1.9606187700855166e-05,
"loss": 0.3013,
"step": 33500
},
{
"epoch": 1.8519527207364237,
"grad_norm": 1.9688265323638916,
"learning_rate": 1.915318554024366e-05,
"loss": 0.3048,
"step": 34000
},
{
"epoch": 1.8791873195707827,
"grad_norm": 0.29343387484550476,
"learning_rate": 1.8699275559671007e-05,
"loss": 0.3037,
"step": 34500
},
{
"epoch": 1.9064219184051419,
"grad_norm": 1.4208123683929443,
"learning_rate": 1.8247181219020645e-05,
"loss": 0.3486,
"step": 35000
},
{
"epoch": 1.933656517239501,
"grad_norm": 0.09636660665273666,
"learning_rate": 1.7793271238447993e-05,
"loss": 0.3057,
"step": 35500
},
{
"epoch": 1.9608911160738602,
"grad_norm": 2.4064226150512695,
"learning_rate": 1.7339361257875337e-05,
"loss": 0.2992,
"step": 36000
},
{
"epoch": 1.9881257149082194,
"grad_norm": 0.09306484460830688,
"learning_rate": 1.6885451277302685e-05,
"loss": 0.2987,
"step": 36500
},
{
"epoch": 2.0,
"eval_runtime": 197.752,
"eval_samples_per_second": 9.507,
"eval_steps_per_second": 9.507,
"step": 36718
},
{
"epoch": 2.0153603137425784,
"grad_norm": 1.6785128116607666,
"learning_rate": 1.6431541296730033e-05,
"loss": 0.3208,
"step": 37000
},
{
"epoch": 2.042594912576938,
"grad_norm": 3.7624003887176514,
"learning_rate": 1.5978539136118526e-05,
"loss": 0.2908,
"step": 37500
},
{
"epoch": 2.069829511411297,
"grad_norm": 0.20085683465003967,
"learning_rate": 1.552462915554587e-05,
"loss": 0.2729,
"step": 38000
},
{
"epoch": 2.097064110245656,
"grad_norm": 0.11236262321472168,
"learning_rate": 1.5070719174973219e-05,
"loss": 0.2664,
"step": 38500
},
{
"epoch": 2.124298709080015,
"grad_norm": 2.0708374977111816,
"learning_rate": 1.4616809194400567e-05,
"loss": 0.2511,
"step": 39000
},
{
"epoch": 2.1515333079143746,
"grad_norm": 1.7030911445617676,
"learning_rate": 1.4162899213827916e-05,
"loss": 0.2809,
"step": 39500
},
{
"epoch": 2.1787679067487336,
"grad_norm": 0.11112015694379807,
"learning_rate": 1.3708989233255262e-05,
"loss": 0.2294,
"step": 40000
},
{
"epoch": 2.206002505583093,
"grad_norm": 2.3932106494903564,
"learning_rate": 1.3255079252682609e-05,
"loss": 0.2881,
"step": 40500
},
{
"epoch": 2.233237104417452,
"grad_norm": 0.9254179000854492,
"learning_rate": 1.2801169272109957e-05,
"loss": 0.2532,
"step": 41000
},
{
"epoch": 2.2604717032518113,
"grad_norm": 0.17265941202640533,
"learning_rate": 1.2347259291537303e-05,
"loss": 0.2502,
"step": 41500
},
{
"epoch": 2.2877063020861703,
"grad_norm": 2.3043088912963867,
"learning_rate": 1.189334931096465e-05,
"loss": 0.2799,
"step": 42000
},
{
"epoch": 2.3149409009205293,
"grad_norm": 0.11663592606782913,
"learning_rate": 1.1439439330391997e-05,
"loss": 0.2569,
"step": 42500
},
{
"epoch": 2.3421754997548887,
"grad_norm": 2.5327258110046387,
"learning_rate": 1.0986437169780488e-05,
"loss": 0.2735,
"step": 43000
},
{
"epoch": 2.3694100985892477,
"grad_norm": 1.2668527364730835,
"learning_rate": 1.0532527189207838e-05,
"loss": 0.2692,
"step": 43500
},
{
"epoch": 2.396644697423607,
"grad_norm": 1.1176379919052124,
"learning_rate": 1.0078617208635184e-05,
"loss": 0.2984,
"step": 44000
},
{
"epoch": 2.423879296257966,
"grad_norm": 0.13128969073295593,
"learning_rate": 9.62470722806253e-06,
"loss": 0.2592,
"step": 44500
},
{
"epoch": 2.4511138950923255,
"grad_norm": 0.8079116344451904,
"learning_rate": 9.170797247489878e-06,
"loss": 0.2869,
"step": 45000
},
{
"epoch": 2.4783484939266844,
"grad_norm": 0.9324661493301392,
"learning_rate": 8.716887266917226e-06,
"loss": 0.2674,
"step": 45500
},
{
"epoch": 2.505583092761044,
"grad_norm": 0.18096031248569489,
"learning_rate": 8.262977286344572e-06,
"loss": 0.2852,
"step": 46000
},
{
"epoch": 2.532817691595403,
"grad_norm": 0.13841697573661804,
"learning_rate": 7.811790765655356e-06,
"loss": 0.2747,
"step": 46500
},
{
"epoch": 2.560052290429762,
"grad_norm": 2.215595006942749,
"learning_rate": 7.357880785082703e-06,
"loss": 0.2836,
"step": 47000
},
{
"epoch": 2.587286889264121,
"grad_norm": 0.17113931477069855,
"learning_rate": 6.90397080451005e-06,
"loss": 0.2791,
"step": 47500
},
{
"epoch": 2.61452148809848,
"grad_norm": 1.888545274734497,
"learning_rate": 6.450060823937397e-06,
"loss": 0.2685,
"step": 48000
},
{
"epoch": 2.6417560869328396,
"grad_norm": 0.15251892805099487,
"learning_rate": 5.996150843364744e-06,
"loss": 0.2958,
"step": 48500
},
{
"epoch": 2.6689906857671986,
"grad_norm": 2.180168628692627,
"learning_rate": 5.542240862792091e-06,
"loss": 0.2887,
"step": 49000
},
{
"epoch": 2.696225284601558,
"grad_norm": 1.863853931427002,
"learning_rate": 5.088330882219438e-06,
"loss": 0.2931,
"step": 49500
},
{
"epoch": 2.723459883435917,
"grad_norm": 2.8054542541503906,
"learning_rate": 4.634420901646785e-06,
"loss": 0.2659,
"step": 50000
},
{
"epoch": 2.7506944822702764,
"grad_norm": 3.5175323486328125,
"learning_rate": 4.180510921074133e-06,
"loss": 0.2574,
"step": 50500
},
{
"epoch": 2.7779290811046353,
"grad_norm": 0.14439070224761963,
"learning_rate": 3.727508760462625e-06,
"loss": 0.2883,
"step": 51000
},
{
"epoch": 2.8051636799389943,
"grad_norm": 0.41310277581214905,
"learning_rate": 3.2735987798899726e-06,
"loss": 0.2963,
"step": 51500
},
{
"epoch": 2.8323982787733537,
"grad_norm": 0.3658026158809662,
"learning_rate": 2.8196887993173193e-06,
"loss": 0.2577,
"step": 52000
},
{
"epoch": 2.8596328776077127,
"grad_norm": 0.11469651013612747,
"learning_rate": 2.365778818744667e-06,
"loss": 0.2784,
"step": 52500
},
{
"epoch": 2.886867476442072,
"grad_norm": 0.16579371690750122,
"learning_rate": 1.911868838172014e-06,
"loss": 0.2777,
"step": 53000
},
{
"epoch": 2.914102075276431,
"grad_norm": 3.678469657897949,
"learning_rate": 1.457958857599361e-06,
"loss": 0.2688,
"step": 53500
},
{
"epoch": 2.9413366741107905,
"grad_norm": 0.30534350872039795,
"learning_rate": 1.0040488770267082e-06,
"loss": 0.2776,
"step": 54000
},
{
"epoch": 2.9685712729451494,
"grad_norm": 0.42191004753112793,
"learning_rate": 5.510467164152006e-07,
"loss": 0.2729,
"step": 54500
},
{
"epoch": 2.995805871779509,
"grad_norm": 1.7490407228469849,
"learning_rate": 9.71367358425477e-08,
"loss": 0.2829,
"step": 55000
},
{
"epoch": 3.0,
"eval_runtime": 197.6032,
"eval_samples_per_second": 9.514,
"eval_steps_per_second": 9.514,
"step": 55077
}
],
"logging_steps": 500,
"max_steps": 55077,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.4605508716999475e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}