trash_vit_trail / trainer_state.json
{
"best_metric": 0.0710952952504158,
"best_model_checkpoint": "./vit-base-trash-demo-v5/checkpoint-4000",
"epoch": 4.0,
"eval_steps": 1000,
"global_step": 4476,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008936550491510277,
"grad_norm": 2.2266733646392822,
"learning_rate": 0.00019955317247542448,
"loss": 1.7283,
"step": 10
},
{
"epoch": 0.017873100983020553,
"grad_norm": 1.7498841285705566,
"learning_rate": 0.000199106344950849,
"loss": 1.2864,
"step": 20
},
{
"epoch": 0.02680965147453083,
"grad_norm": 2.307732582092285,
"learning_rate": 0.00019865951742627347,
"loss": 0.9102,
"step": 30
},
{
"epoch": 0.035746201966041107,
"grad_norm": 4.274341106414795,
"learning_rate": 0.00019821268990169794,
"loss": 0.7496,
"step": 40
},
{
"epoch": 0.044682752457551385,
"grad_norm": 2.586291551589966,
"learning_rate": 0.00019776586237712246,
"loss": 0.7103,
"step": 50
},
{
"epoch": 0.05361930294906166,
"grad_norm": 1.1641569137573242,
"learning_rate": 0.00019731903485254693,
"loss": 0.5292,
"step": 60
},
{
"epoch": 0.06255585344057193,
"grad_norm": 5.71007776260376,
"learning_rate": 0.0001968722073279714,
"loss": 0.5169,
"step": 70
},
{
"epoch": 0.07149240393208221,
"grad_norm": 4.907001495361328,
"learning_rate": 0.0001964253798033959,
"loss": 0.5468,
"step": 80
},
{
"epoch": 0.08042895442359249,
"grad_norm": 4.788897514343262,
"learning_rate": 0.00019597855227882039,
"loss": 0.5174,
"step": 90
},
{
"epoch": 0.08936550491510277,
"grad_norm": 3.1065032482147217,
"learning_rate": 0.00019553172475424485,
"loss": 0.4598,
"step": 100
},
{
"epoch": 0.09830205540661305,
"grad_norm": 2.299734115600586,
"learning_rate": 0.00019508489722966935,
"loss": 0.3711,
"step": 110
},
{
"epoch": 0.10723860589812333,
"grad_norm": 1.3453820943832397,
"learning_rate": 0.00019463806970509384,
"loss": 0.4316,
"step": 120
},
{
"epoch": 0.1161751563896336,
"grad_norm": 2.797497034072876,
"learning_rate": 0.00019419124218051834,
"loss": 0.5008,
"step": 130
},
{
"epoch": 0.12511170688114387,
"grad_norm": 3.164219856262207,
"learning_rate": 0.0001937444146559428,
"loss": 0.371,
"step": 140
},
{
"epoch": 0.13404825737265416,
"grad_norm": 5.14756965637207,
"learning_rate": 0.0001932975871313673,
"loss": 0.5617,
"step": 150
},
{
"epoch": 0.14298480786416443,
"grad_norm": 1.478502869606018,
"learning_rate": 0.0001928507596067918,
"loss": 0.3317,
"step": 160
},
{
"epoch": 0.15192135835567472,
"grad_norm": 1.3914997577667236,
"learning_rate": 0.00019240393208221627,
"loss": 0.3637,
"step": 170
},
{
"epoch": 0.16085790884718498,
"grad_norm": 2.963843822479248,
"learning_rate": 0.00019195710455764076,
"loss": 0.3628,
"step": 180
},
{
"epoch": 0.16979445933869527,
"grad_norm": 4.369801044464111,
"learning_rate": 0.00019151027703306526,
"loss": 0.4182,
"step": 190
},
{
"epoch": 0.17873100983020554,
"grad_norm": 2.109142541885376,
"learning_rate": 0.00019106344950848973,
"loss": 0.279,
"step": 200
},
{
"epoch": 0.1876675603217158,
"grad_norm": 5.093620777130127,
"learning_rate": 0.00019061662198391422,
"loss": 0.4936,
"step": 210
},
{
"epoch": 0.1966041108132261,
"grad_norm": 2.2940657138824463,
"learning_rate": 0.00019016979445933872,
"loss": 0.532,
"step": 220
},
{
"epoch": 0.20554066130473636,
"grad_norm": 3.415463924407959,
"learning_rate": 0.00018972296693476319,
"loss": 0.3475,
"step": 230
},
{
"epoch": 0.21447721179624665,
"grad_norm": 1.158109426498413,
"learning_rate": 0.00018927613941018768,
"loss": 0.386,
"step": 240
},
{
"epoch": 0.22341376228775692,
"grad_norm": 0.44666406512260437,
"learning_rate": 0.00018882931188561218,
"loss": 0.4165,
"step": 250
},
{
"epoch": 0.2323503127792672,
"grad_norm": 2.626112461090088,
"learning_rate": 0.00018838248436103664,
"loss": 0.3478,
"step": 260
},
{
"epoch": 0.24128686327077747,
"grad_norm": 3.9105656147003174,
"learning_rate": 0.00018793565683646114,
"loss": 0.293,
"step": 270
},
{
"epoch": 0.25022341376228774,
"grad_norm": 6.075140953063965,
"learning_rate": 0.00018748882931188563,
"loss": 0.4013,
"step": 280
},
{
"epoch": 0.25915996425379806,
"grad_norm": 1.2540974617004395,
"learning_rate": 0.0001870420017873101,
"loss": 0.5419,
"step": 290
},
{
"epoch": 0.2680965147453083,
"grad_norm": 2.7805850505828857,
"learning_rate": 0.00018659517426273457,
"loss": 0.373,
"step": 300
},
{
"epoch": 0.2770330652368186,
"grad_norm": 1.9444429874420166,
"learning_rate": 0.0001861483467381591,
"loss": 0.2447,
"step": 310
},
{
"epoch": 0.28596961572832885,
"grad_norm": 2.164452075958252,
"learning_rate": 0.00018570151921358356,
"loss": 0.4141,
"step": 320
},
{
"epoch": 0.2949061662198391,
"grad_norm": 4.439435958862305,
"learning_rate": 0.00018525469168900803,
"loss": 0.3183,
"step": 330
},
{
"epoch": 0.30384271671134944,
"grad_norm": 3.332730770111084,
"learning_rate": 0.00018480786416443255,
"loss": 0.3255,
"step": 340
},
{
"epoch": 0.3127792672028597,
"grad_norm": 4.461299419403076,
"learning_rate": 0.00018436103663985702,
"loss": 0.2733,
"step": 350
},
{
"epoch": 0.32171581769436997,
"grad_norm": 4.637039661407471,
"learning_rate": 0.0001839142091152815,
"loss": 0.2486,
"step": 360
},
{
"epoch": 0.33065236818588023,
"grad_norm": 1.985630989074707,
"learning_rate": 0.000183467381590706,
"loss": 0.2978,
"step": 370
},
{
"epoch": 0.33958891867739055,
"grad_norm": 2.7530932426452637,
"learning_rate": 0.00018302055406613048,
"loss": 0.3722,
"step": 380
},
{
"epoch": 0.3485254691689008,
"grad_norm": 0.20281237363815308,
"learning_rate": 0.00018257372654155497,
"loss": 0.3897,
"step": 390
},
{
"epoch": 0.3574620196604111,
"grad_norm": 1.9220471382141113,
"learning_rate": 0.00018212689901697947,
"loss": 0.2564,
"step": 400
},
{
"epoch": 0.36639857015192134,
"grad_norm": 4.841084957122803,
"learning_rate": 0.00018168007149240394,
"loss": 0.3892,
"step": 410
},
{
"epoch": 0.3753351206434316,
"grad_norm": 5.499583721160889,
"learning_rate": 0.00018123324396782843,
"loss": 0.3462,
"step": 420
},
{
"epoch": 0.38427167113494193,
"grad_norm": 1.4320725202560425,
"learning_rate": 0.0001807864164432529,
"loss": 0.2658,
"step": 430
},
{
"epoch": 0.3932082216264522,
"grad_norm": 2.9739625453948975,
"learning_rate": 0.0001803395889186774,
"loss": 0.269,
"step": 440
},
{
"epoch": 0.40214477211796246,
"grad_norm": 2.0247411727905273,
"learning_rate": 0.0001798927613941019,
"loss": 0.3355,
"step": 450
},
{
"epoch": 0.4110813226094727,
"grad_norm": 0.25784507393836975,
"learning_rate": 0.00017944593386952636,
"loss": 0.2589,
"step": 460
},
{
"epoch": 0.42001787310098304,
"grad_norm": 8.2475004196167,
"learning_rate": 0.00017899910634495086,
"loss": 0.3034,
"step": 470
},
{
"epoch": 0.4289544235924933,
"grad_norm": 2.1054959297180176,
"learning_rate": 0.00017855227882037535,
"loss": 0.3784,
"step": 480
},
{
"epoch": 0.43789097408400357,
"grad_norm": 2.6620118618011475,
"learning_rate": 0.00017810545129579982,
"loss": 0.3303,
"step": 490
},
{
"epoch": 0.44682752457551383,
"grad_norm": 2.4308674335479736,
"learning_rate": 0.00017765862377122431,
"loss": 0.4428,
"step": 500
},
{
"epoch": 0.45576407506702415,
"grad_norm": 4.620336532592773,
"learning_rate": 0.0001772117962466488,
"loss": 0.4007,
"step": 510
},
{
"epoch": 0.4647006255585344,
"grad_norm": 0.6208035945892334,
"learning_rate": 0.00017676496872207328,
"loss": 0.4641,
"step": 520
},
{
"epoch": 0.4736371760500447,
"grad_norm": 3.478276252746582,
"learning_rate": 0.00017631814119749777,
"loss": 0.3107,
"step": 530
},
{
"epoch": 0.48257372654155495,
"grad_norm": 2.8934295177459717,
"learning_rate": 0.00017587131367292227,
"loss": 0.2283,
"step": 540
},
{
"epoch": 0.4915102770330652,
"grad_norm": 2.323265552520752,
"learning_rate": 0.00017542448614834674,
"loss": 0.253,
"step": 550
},
{
"epoch": 0.5004468275245755,
"grad_norm": 0.8551294207572937,
"learning_rate": 0.00017497765862377123,
"loss": 0.2903,
"step": 560
},
{
"epoch": 0.5093833780160858,
"grad_norm": 3.454586982727051,
"learning_rate": 0.00017453083109919573,
"loss": 0.2434,
"step": 570
},
{
"epoch": 0.5183199285075961,
"grad_norm": 4.937902927398682,
"learning_rate": 0.0001740840035746202,
"loss": 0.254,
"step": 580
},
{
"epoch": 0.5272564789991063,
"grad_norm": 7.016292095184326,
"learning_rate": 0.0001736371760500447,
"loss": 0.3365,
"step": 590
},
{
"epoch": 0.5361930294906166,
"grad_norm": 4.799339294433594,
"learning_rate": 0.0001731903485254692,
"loss": 0.2639,
"step": 600
},
{
"epoch": 0.5451295799821269,
"grad_norm": 3.4884583950042725,
"learning_rate": 0.00017274352100089365,
"loss": 0.2255,
"step": 610
},
{
"epoch": 0.5540661304736372,
"grad_norm": 1.6748002767562866,
"learning_rate": 0.00017229669347631815,
"loss": 0.2678,
"step": 620
},
{
"epoch": 0.5630026809651475,
"grad_norm": 4.145753383636475,
"learning_rate": 0.00017184986595174265,
"loss": 0.3435,
"step": 630
},
{
"epoch": 0.5719392314566577,
"grad_norm": 3.8941946029663086,
"learning_rate": 0.00017140303842716711,
"loss": 0.3448,
"step": 640
},
{
"epoch": 0.580875781948168,
"grad_norm": 2.7980730533599854,
"learning_rate": 0.0001709562109025916,
"loss": 0.1776,
"step": 650
},
{
"epoch": 0.5898123324396782,
"grad_norm": 2.6846330165863037,
"learning_rate": 0.0001705093833780161,
"loss": 0.1698,
"step": 660
},
{
"epoch": 0.5987488829311886,
"grad_norm": 0.6026754379272461,
"learning_rate": 0.00017006255585344057,
"loss": 0.2683,
"step": 670
},
{
"epoch": 0.6076854334226989,
"grad_norm": 1.7795771360397339,
"learning_rate": 0.00016961572832886507,
"loss": 0.1734,
"step": 680
},
{
"epoch": 0.6166219839142091,
"grad_norm": 2.9793999195098877,
"learning_rate": 0.00016916890080428956,
"loss": 0.2341,
"step": 690
},
{
"epoch": 0.6255585344057194,
"grad_norm": 2.885993480682373,
"learning_rate": 0.00016872207327971403,
"loss": 0.1787,
"step": 700
},
{
"epoch": 0.6344950848972297,
"grad_norm": 0.12324349582195282,
"learning_rate": 0.00016827524575513853,
"loss": 0.1716,
"step": 710
},
{
"epoch": 0.6434316353887399,
"grad_norm": 3.5133144855499268,
"learning_rate": 0.00016782841823056302,
"loss": 0.248,
"step": 720
},
{
"epoch": 0.6523681858802503,
"grad_norm": 0.16779197752475739,
"learning_rate": 0.0001673815907059875,
"loss": 0.1969,
"step": 730
},
{
"epoch": 0.6613047363717605,
"grad_norm": 6.696399688720703,
"learning_rate": 0.00016693476318141199,
"loss": 0.2659,
"step": 740
},
{
"epoch": 0.6702412868632708,
"grad_norm": 3.4363462924957275,
"learning_rate": 0.00016648793565683648,
"loss": 0.1933,
"step": 750
},
{
"epoch": 0.6791778373547811,
"grad_norm": 2.9766454696655273,
"learning_rate": 0.00016604110813226095,
"loss": 0.3646,
"step": 760
},
{
"epoch": 0.6881143878462913,
"grad_norm": 3.3751492500305176,
"learning_rate": 0.00016559428060768544,
"loss": 0.2365,
"step": 770
},
{
"epoch": 0.6970509383378016,
"grad_norm": 1.6829200983047485,
"learning_rate": 0.00016514745308310994,
"loss": 0.3954,
"step": 780
},
{
"epoch": 0.7059874888293118,
"grad_norm": 3.473019599914551,
"learning_rate": 0.0001647006255585344,
"loss": 0.2257,
"step": 790
},
{
"epoch": 0.7149240393208222,
"grad_norm": 2.1625287532806396,
"learning_rate": 0.0001642537980339589,
"loss": 0.2984,
"step": 800
},
{
"epoch": 0.7238605898123325,
"grad_norm": 1.2086807489395142,
"learning_rate": 0.00016380697050938337,
"loss": 0.2888,
"step": 810
},
{
"epoch": 0.7327971403038427,
"grad_norm": 0.19183319807052612,
"learning_rate": 0.00016336014298480787,
"loss": 0.2884,
"step": 820
},
{
"epoch": 0.741733690795353,
"grad_norm": 4.687781810760498,
"learning_rate": 0.00016291331546023236,
"loss": 0.2456,
"step": 830
},
{
"epoch": 0.7506702412868632,
"grad_norm": 1.6150999069213867,
"learning_rate": 0.00016246648793565683,
"loss": 0.2231,
"step": 840
},
{
"epoch": 0.7596067917783735,
"grad_norm": 2.592801809310913,
"learning_rate": 0.00016201966041108133,
"loss": 0.2425,
"step": 850
},
{
"epoch": 0.7685433422698839,
"grad_norm": 5.782228469848633,
"learning_rate": 0.00016157283288650582,
"loss": 0.2184,
"step": 860
},
{
"epoch": 0.7774798927613941,
"grad_norm": 4.794034957885742,
"learning_rate": 0.0001611260053619303,
"loss": 0.1862,
"step": 870
},
{
"epoch": 0.7864164432529044,
"grad_norm": 6.517756462097168,
"learning_rate": 0.0001606791778373548,
"loss": 0.3502,
"step": 880
},
{
"epoch": 0.7953529937444147,
"grad_norm": 6.479066848754883,
"learning_rate": 0.00016023235031277928,
"loss": 0.1433,
"step": 890
},
{
"epoch": 0.8042895442359249,
"grad_norm": 1.539117455482483,
"learning_rate": 0.00015978552278820375,
"loss": 0.3306,
"step": 900
},
{
"epoch": 0.8132260947274352,
"grad_norm": 2.0679945945739746,
"learning_rate": 0.00015933869526362827,
"loss": 0.2325,
"step": 910
},
{
"epoch": 0.8221626452189454,
"grad_norm": 4.1405558586120605,
"learning_rate": 0.00015889186773905274,
"loss": 0.2126,
"step": 920
},
{
"epoch": 0.8310991957104558,
"grad_norm": 3.7805371284484863,
"learning_rate": 0.0001584450402144772,
"loss": 0.2418,
"step": 930
},
{
"epoch": 0.8400357462019661,
"grad_norm": 4.6036248207092285,
"learning_rate": 0.0001579982126899017,
"loss": 0.3191,
"step": 940
},
{
"epoch": 0.8489722966934763,
"grad_norm": 0.8650698661804199,
"learning_rate": 0.0001575513851653262,
"loss": 0.0849,
"step": 950
},
{
"epoch": 0.8579088471849866,
"grad_norm": 0.4226575791835785,
"learning_rate": 0.00015710455764075067,
"loss": 0.1689,
"step": 960
},
{
"epoch": 0.8668453976764968,
"grad_norm": 4.508443355560303,
"learning_rate": 0.00015665773011617516,
"loss": 0.1381,
"step": 970
},
{
"epoch": 0.8757819481680071,
"grad_norm": 5.323261260986328,
"learning_rate": 0.00015621090259159966,
"loss": 0.1799,
"step": 980
},
{
"epoch": 0.8847184986595175,
"grad_norm": 4.80311393737793,
"learning_rate": 0.00015576407506702412,
"loss": 0.1616,
"step": 990
},
{
"epoch": 0.8936550491510277,
"grad_norm": 2.0073227882385254,
"learning_rate": 0.00015531724754244862,
"loss": 0.1814,
"step": 1000
},
{
"epoch": 0.8936550491510277,
"eval_accuracy": 0.9487437185929648,
"eval_loss": 0.17145079374313354,
"eval_runtime": 56.2937,
"eval_samples_per_second": 35.35,
"eval_steps_per_second": 4.423,
"step": 1000
},
{
"epoch": 0.902591599642538,
"grad_norm": 3.7729084491729736,
"learning_rate": 0.00015487042001787312,
"loss": 0.1352,
"step": 1010
},
{
"epoch": 0.9115281501340483,
"grad_norm": 6.341058254241943,
"learning_rate": 0.00015442359249329758,
"loss": 0.2095,
"step": 1020
},
{
"epoch": 0.9204647006255585,
"grad_norm": 0.42624279856681824,
"learning_rate": 0.00015397676496872208,
"loss": 0.2741,
"step": 1030
},
{
"epoch": 0.9294012511170688,
"grad_norm": 4.386059761047363,
"learning_rate": 0.00015352993744414657,
"loss": 0.1688,
"step": 1040
},
{
"epoch": 0.938337801608579,
"grad_norm": 0.07118342816829681,
"learning_rate": 0.00015308310991957104,
"loss": 0.2251,
"step": 1050
},
{
"epoch": 0.9472743521000894,
"grad_norm": 1.6763445138931274,
"learning_rate": 0.00015263628239499554,
"loss": 0.2107,
"step": 1060
},
{
"epoch": 0.9562109025915997,
"grad_norm": 5.161765098571777,
"learning_rate": 0.00015218945487042003,
"loss": 0.1552,
"step": 1070
},
{
"epoch": 0.9651474530831099,
"grad_norm": 1.8887020349502563,
"learning_rate": 0.0001517426273458445,
"loss": 0.259,
"step": 1080
},
{
"epoch": 0.9740840035746202,
"grad_norm": 0.29733049869537354,
"learning_rate": 0.000151295799821269,
"loss": 0.1136,
"step": 1090
},
{
"epoch": 0.9830205540661304,
"grad_norm": 3.253506660461426,
"learning_rate": 0.0001508489722966935,
"loss": 0.1211,
"step": 1100
},
{
"epoch": 0.9919571045576407,
"grad_norm": 2.1613495349884033,
"learning_rate": 0.00015040214477211796,
"loss": 0.1925,
"step": 1110
},
{
"epoch": 1.000893655049151,
"grad_norm": 0.23403897881507874,
"learning_rate": 0.00014995531724754246,
"loss": 0.1099,
"step": 1120
},
{
"epoch": 1.0098302055406614,
"grad_norm": 0.7746050357818604,
"learning_rate": 0.00014950848972296695,
"loss": 0.0627,
"step": 1130
},
{
"epoch": 1.0187667560321716,
"grad_norm": 0.3406558930873871,
"learning_rate": 0.00014906166219839145,
"loss": 0.063,
"step": 1140
},
{
"epoch": 1.0277033065236818,
"grad_norm": 0.12071269750595093,
"learning_rate": 0.00014861483467381591,
"loss": 0.1432,
"step": 1150
},
{
"epoch": 1.0366398570151922,
"grad_norm": 0.4978802800178528,
"learning_rate": 0.00014816800714924038,
"loss": 0.0746,
"step": 1160
},
{
"epoch": 1.0455764075067024,
"grad_norm": 1.3803187608718872,
"learning_rate": 0.0001477211796246649,
"loss": 0.0929,
"step": 1170
},
{
"epoch": 1.0545129579982127,
"grad_norm": 0.8612852692604065,
"learning_rate": 0.00014727435210008937,
"loss": 0.0949,
"step": 1180
},
{
"epoch": 1.063449508489723,
"grad_norm": 0.3493196666240692,
"learning_rate": 0.00014682752457551384,
"loss": 0.168,
"step": 1190
},
{
"epoch": 1.0723860589812333,
"grad_norm": 0.18564535677433014,
"learning_rate": 0.00014638069705093836,
"loss": 0.1344,
"step": 1200
},
{
"epoch": 1.0813226094727435,
"grad_norm": 5.853936672210693,
"learning_rate": 0.00014593386952636283,
"loss": 0.0874,
"step": 1210
},
{
"epoch": 1.0902591599642537,
"grad_norm": 5.366926670074463,
"learning_rate": 0.0001454870420017873,
"loss": 0.0847,
"step": 1220
},
{
"epoch": 1.0991957104557641,
"grad_norm": 0.04646310582756996,
"learning_rate": 0.00014504021447721182,
"loss": 0.0947,
"step": 1230
},
{
"epoch": 1.1081322609472744,
"grad_norm": 0.7593271732330322,
"learning_rate": 0.0001445933869526363,
"loss": 0.1208,
"step": 1240
},
{
"epoch": 1.1170688114387846,
"grad_norm": 6.3901472091674805,
"learning_rate": 0.00014414655942806076,
"loss": 0.0977,
"step": 1250
},
{
"epoch": 1.126005361930295,
"grad_norm": 1.7897100448608398,
"learning_rate": 0.00014369973190348528,
"loss": 0.1107,
"step": 1260
},
{
"epoch": 1.1349419124218052,
"grad_norm": 3.0502521991729736,
"learning_rate": 0.00014325290437890975,
"loss": 0.1854,
"step": 1270
},
{
"epoch": 1.1438784629133154,
"grad_norm": 0.18816685676574707,
"learning_rate": 0.00014280607685433422,
"loss": 0.0863,
"step": 1280
},
{
"epoch": 1.1528150134048256,
"grad_norm": 0.05062058940529823,
"learning_rate": 0.0001423592493297587,
"loss": 0.1339,
"step": 1290
},
{
"epoch": 1.161751563896336,
"grad_norm": 0.23230992257595062,
"learning_rate": 0.0001419124218051832,
"loss": 0.187,
"step": 1300
},
{
"epoch": 1.1706881143878463,
"grad_norm": 3.0492892265319824,
"learning_rate": 0.00014146559428060768,
"loss": 0.0701,
"step": 1310
},
{
"epoch": 1.1796246648793565,
"grad_norm": 0.03424559161067009,
"learning_rate": 0.00014101876675603217,
"loss": 0.0428,
"step": 1320
},
{
"epoch": 1.188561215370867,
"grad_norm": 3.6026527881622314,
"learning_rate": 0.00014057193923145667,
"loss": 0.1337,
"step": 1330
},
{
"epoch": 1.197497765862377,
"grad_norm": 0.09644579142332077,
"learning_rate": 0.00014012511170688114,
"loss": 0.1271,
"step": 1340
},
{
"epoch": 1.2064343163538873,
"grad_norm": 0.22322706878185272,
"learning_rate": 0.00013967828418230563,
"loss": 0.055,
"step": 1350
},
{
"epoch": 1.2153708668453977,
"grad_norm": 0.05372155085206032,
"learning_rate": 0.00013923145665773013,
"loss": 0.1251,
"step": 1360
},
{
"epoch": 1.224307417336908,
"grad_norm": 0.4156355857849121,
"learning_rate": 0.00013878462913315462,
"loss": 0.0405,
"step": 1370
},
{
"epoch": 1.2332439678284182,
"grad_norm": 0.030704261735081673,
"learning_rate": 0.0001383378016085791,
"loss": 0.1225,
"step": 1380
},
{
"epoch": 1.2421805183199286,
"grad_norm": 0.09552694112062454,
"learning_rate": 0.00013789097408400359,
"loss": 0.0495,
"step": 1390
},
{
"epoch": 1.2511170688114388,
"grad_norm": 2.1124463081359863,
"learning_rate": 0.00013744414655942808,
"loss": 0.0651,
"step": 1400
},
{
"epoch": 1.260053619302949,
"grad_norm": 4.6221232414245605,
"learning_rate": 0.00013699731903485255,
"loss": 0.2366,
"step": 1410
},
{
"epoch": 1.2689901697944594,
"grad_norm": 0.054540861397981644,
"learning_rate": 0.00013655049151027704,
"loss": 0.1915,
"step": 1420
},
{
"epoch": 1.2779267202859697,
"grad_norm": 0.6603236198425293,
"learning_rate": 0.00013610366398570154,
"loss": 0.0386,
"step": 1430
},
{
"epoch": 1.2868632707774799,
"grad_norm": 4.419101715087891,
"learning_rate": 0.000135656836461126,
"loss": 0.1288,
"step": 1440
},
{
"epoch": 1.2957998212689903,
"grad_norm": 1.6491079330444336,
"learning_rate": 0.0001352100089365505,
"loss": 0.077,
"step": 1450
},
{
"epoch": 1.3047363717605005,
"grad_norm": 0.904062807559967,
"learning_rate": 0.000134763181411975,
"loss": 0.2083,
"step": 1460
},
{
"epoch": 1.3136729222520107,
"grad_norm": 3.4404361248016357,
"learning_rate": 0.00013431635388739947,
"loss": 0.1846,
"step": 1470
},
{
"epoch": 1.322609472743521,
"grad_norm": 0.2096666842699051,
"learning_rate": 0.00013386952636282396,
"loss": 0.0354,
"step": 1480
},
{
"epoch": 1.3315460232350314,
"grad_norm": 4.2826128005981445,
"learning_rate": 0.00013342269883824846,
"loss": 0.1438,
"step": 1490
},
{
"epoch": 1.3404825737265416,
"grad_norm": 4.742111682891846,
"learning_rate": 0.00013297587131367293,
"loss": 0.0994,
"step": 1500
},
{
"epoch": 1.3494191242180518,
"grad_norm": 6.2931952476501465,
"learning_rate": 0.0001325290437890974,
"loss": 0.0754,
"step": 1510
},
{
"epoch": 1.358355674709562,
"grad_norm": 1.523571491241455,
"learning_rate": 0.00013208221626452192,
"loss": 0.1283,
"step": 1520
},
{
"epoch": 1.3672922252010724,
"grad_norm": 8.253166198730469,
"learning_rate": 0.00013163538873994638,
"loss": 0.1718,
"step": 1530
},
{
"epoch": 1.3762287756925826,
"grad_norm": 2.4168646335601807,
"learning_rate": 0.00013118856121537085,
"loss": 0.1285,
"step": 1540
},
{
"epoch": 1.3851653261840928,
"grad_norm": 4.069122314453125,
"learning_rate": 0.00013074173369079537,
"loss": 0.1165,
"step": 1550
},
{
"epoch": 1.3941018766756033,
"grad_norm": 0.2789513170719147,
"learning_rate": 0.00013029490616621984,
"loss": 0.0795,
"step": 1560
},
{
"epoch": 1.4030384271671135,
"grad_norm": 0.5609318017959595,
"learning_rate": 0.0001298480786416443,
"loss": 0.1187,
"step": 1570
},
{
"epoch": 1.4119749776586237,
"grad_norm": 0.34373611211776733,
"learning_rate": 0.00012940125111706883,
"loss": 0.0872,
"step": 1580
},
{
"epoch": 1.420911528150134,
"grad_norm": 4.596048355102539,
"learning_rate": 0.0001289544235924933,
"loss": 0.1354,
"step": 1590
},
{
"epoch": 1.4298480786416443,
"grad_norm": 0.06107456609606743,
"learning_rate": 0.00012850759606791777,
"loss": 0.119,
"step": 1600
},
{
"epoch": 1.4387846291331545,
"grad_norm": 0.08292512595653534,
"learning_rate": 0.0001280607685433423,
"loss": 0.1075,
"step": 1610
},
{
"epoch": 1.447721179624665,
"grad_norm": 0.04113980755209923,
"learning_rate": 0.00012761394101876676,
"loss": 0.099,
"step": 1620
},
{
"epoch": 1.4566577301161752,
"grad_norm": 3.1171679496765137,
"learning_rate": 0.00012716711349419126,
"loss": 0.0476,
"step": 1630
},
{
"epoch": 1.4655942806076854,
"grad_norm": 0.03248828276991844,
"learning_rate": 0.00012672028596961572,
"loss": 0.1217,
"step": 1640
},
{
"epoch": 1.4745308310991958,
"grad_norm": 0.14615251123905182,
"learning_rate": 0.00012627345844504022,
"loss": 0.0845,
"step": 1650
},
{
"epoch": 1.483467381590706,
"grad_norm": 0.8569982647895813,
"learning_rate": 0.00012582663092046471,
"loss": 0.0933,
"step": 1660
},
{
"epoch": 1.4924039320822162,
"grad_norm": 0.030800212174654007,
"learning_rate": 0.00012537980339588918,
"loss": 0.0555,
"step": 1670
},
{
"epoch": 1.5013404825737267,
"grad_norm": 0.9634251594543457,
"learning_rate": 0.00012493297587131368,
"loss": 0.1249,
"step": 1680
},
{
"epoch": 1.5102770330652369,
"grad_norm": 0.06999039649963379,
"learning_rate": 0.00012448614834673817,
"loss": 0.0727,
"step": 1690
},
{
"epoch": 1.519213583556747,
"grad_norm": 0.0438673160970211,
"learning_rate": 0.00012403932082216264,
"loss": 0.0595,
"step": 1700
},
{
"epoch": 1.5281501340482575,
"grad_norm": 0.030631419271230698,
"learning_rate": 0.00012359249329758714,
"loss": 0.0641,
"step": 1710
},
{
"epoch": 1.5370866845397675,
"grad_norm": 0.09066120535135269,
"learning_rate": 0.00012314566577301163,
"loss": 0.0689,
"step": 1720
},
{
"epoch": 1.546023235031278,
"grad_norm": 1.1478157043457031,
"learning_rate": 0.0001226988382484361,
"loss": 0.0427,
"step": 1730
},
{
"epoch": 1.5549597855227884,
"grad_norm": 0.5382466912269592,
"learning_rate": 0.0001222520107238606,
"loss": 0.1211,
"step": 1740
},
{
"epoch": 1.5638963360142983,
"grad_norm": 0.15291939675807953,
"learning_rate": 0.00012180518319928509,
"loss": 0.1934,
"step": 1750
},
{
"epoch": 1.5728328865058088,
"grad_norm": 0.07158921658992767,
"learning_rate": 0.00012135835567470957,
"loss": 0.045,
"step": 1760
},
{
"epoch": 1.5817694369973192,
"grad_norm": 1.416129469871521,
"learning_rate": 0.00012091152815013404,
"loss": 0.0822,
"step": 1770
},
{
"epoch": 1.5907059874888292,
"grad_norm": 3.2841928005218506,
"learning_rate": 0.00012046470062555855,
"loss": 0.0685,
"step": 1780
},
{
"epoch": 1.5996425379803396,
"grad_norm": 5.683614730834961,
"learning_rate": 0.00012001787310098302,
"loss": 0.1512,
"step": 1790
},
{
"epoch": 1.6085790884718498,
"grad_norm": 0.054330743849277496,
"learning_rate": 0.0001195710455764075,
"loss": 0.1381,
"step": 1800
},
{
"epoch": 1.61751563896336,
"grad_norm": 0.05368073284626007,
"learning_rate": 0.00011912421805183201,
"loss": 0.1118,
"step": 1810
},
{
"epoch": 1.6264521894548705,
"grad_norm": 17.735898971557617,
"learning_rate": 0.00011867739052725648,
"loss": 0.1704,
"step": 1820
},
{
"epoch": 1.6353887399463807,
"grad_norm": 3.4387574195861816,
"learning_rate": 0.00011823056300268096,
"loss": 0.1498,
"step": 1830
},
{
"epoch": 1.6443252904378909,
"grad_norm": 3.4959723949432373,
"learning_rate": 0.00011778373547810547,
"loss": 0.0667,
"step": 1840
},
{
"epoch": 1.6532618409294013,
"grad_norm": 1.4753037691116333,
"learning_rate": 0.00011733690795352994,
"loss": 0.0445,
"step": 1850
},
{
"epoch": 1.6621983914209115,
"grad_norm": 0.24579989910125732,
"learning_rate": 0.00011689008042895442,
"loss": 0.0377,
"step": 1860
},
{
"epoch": 1.6711349419124217,
"grad_norm": 3.813619375228882,
"learning_rate": 0.00011644325290437891,
"loss": 0.1004,
"step": 1870
},
{
"epoch": 1.6800714924039322,
"grad_norm": 0.808028519153595,
"learning_rate": 0.0001159964253798034,
"loss": 0.0679,
"step": 1880
},
{
"epoch": 1.6890080428954424,
"grad_norm": 0.277228444814682,
"learning_rate": 0.0001155495978552279,
"loss": 0.1096,
"step": 1890
},
{
"epoch": 1.6979445933869526,
"grad_norm": 2.485595703125,
"learning_rate": 0.00011510277033065237,
"loss": 0.0738,
"step": 1900
},
{
"epoch": 1.706881143878463,
"grad_norm": 0.35362759232521057,
"learning_rate": 0.00011465594280607685,
"loss": 0.0807,
"step": 1910
},
{
"epoch": 1.7158176943699732,
"grad_norm": 1.7707135677337646,
"learning_rate": 0.00011420911528150135,
"loss": 0.0603,
"step": 1920
},
{
"epoch": 1.7247542448614834,
"grad_norm": 0.010053984820842743,
"learning_rate": 0.00011376228775692583,
"loss": 0.0142,
"step": 1930
},
{
"epoch": 1.7336907953529939,
"grad_norm": 11.442891120910645,
"learning_rate": 0.00011331546023235031,
"loss": 0.0509,
"step": 1940
},
{
"epoch": 1.742627345844504,
"grad_norm": 2.5633316040039062,
"learning_rate": 0.00011286863270777481,
"loss": 0.0204,
"step": 1950
},
{
"epoch": 1.7515638963360143,
"grad_norm": 0.9002701044082642,
"learning_rate": 0.00011242180518319929,
"loss": 0.0822,
"step": 1960
},
{
"epoch": 1.7605004468275247,
"grad_norm": 0.03169967234134674,
"learning_rate": 0.00011197497765862377,
"loss": 0.0951,
"step": 1970
},
{
"epoch": 1.7694369973190347,
"grad_norm": 0.07693292945623398,
"learning_rate": 0.00011152815013404827,
"loss": 0.11,
"step": 1980
},
{
"epoch": 1.7783735478105451,
"grad_norm": 0.06315601617097855,
"learning_rate": 0.00011108132260947275,
"loss": 0.1217,
"step": 1990
},
{
"epoch": 1.7873100983020556,
"grad_norm": 0.26389381289482117,
"learning_rate": 0.00011063449508489723,
"loss": 0.1077,
"step": 2000
},
{
"epoch": 1.7873100983020556,
"eval_accuracy": 0.9668341708542714,
"eval_loss": 0.12829196453094482,
"eval_runtime": 56.3081,
"eval_samples_per_second": 35.341,
"eval_steps_per_second": 4.422,
"step": 2000
},
{
"epoch": 1.7962466487935655,
"grad_norm": 0.14058926701545715,
"learning_rate": 0.00011018766756032173,
"loss": 0.051,
"step": 2010
},
{
"epoch": 1.805183199285076,
"grad_norm": 0.8464193940162659,
"learning_rate": 0.00010974084003574621,
"loss": 0.0557,
"step": 2020
},
{
"epoch": 1.8141197497765862,
"grad_norm": 0.5524567365646362,
"learning_rate": 0.00010929401251117069,
"loss": 0.0327,
"step": 2030
},
{
"epoch": 1.8230563002680964,
"grad_norm": 4.706042289733887,
"learning_rate": 0.00010884718498659518,
"loss": 0.0815,
"step": 2040
},
{
"epoch": 1.8319928507596068,
"grad_norm": 5.365744113922119,
"learning_rate": 0.00010840035746201967,
"loss": 0.0617,
"step": 2050
},
{
"epoch": 1.840929401251117,
"grad_norm": 1.1039865016937256,
"learning_rate": 0.00010795352993744415,
"loss": 0.0528,
"step": 2060
},
{
"epoch": 1.8498659517426272,
"grad_norm": 2.8230929374694824,
"learning_rate": 0.00010750670241286864,
"loss": 0.0534,
"step": 2070
},
{
"epoch": 1.8588025022341377,
"grad_norm": 0.02104310691356659,
"learning_rate": 0.00010705987488829313,
"loss": 0.1058,
"step": 2080
},
{
"epoch": 1.8677390527256479,
"grad_norm": 0.030116664245724678,
"learning_rate": 0.0001066130473637176,
"loss": 0.0971,
"step": 2090
},
{
"epoch": 1.876675603217158,
"grad_norm": 0.5036576986312866,
"learning_rate": 0.0001061662198391421,
"loss": 0.0693,
"step": 2100
},
{
"epoch": 1.8856121537086685,
"grad_norm": 4.131002426147461,
"learning_rate": 0.00010571939231456658,
"loss": 0.0933,
"step": 2110
},
{
"epoch": 1.8945487042001787,
"grad_norm": 5.004481792449951,
"learning_rate": 0.00010527256478999108,
"loss": 0.0698,
"step": 2120
},
{
"epoch": 1.903485254691689,
"grad_norm": 0.014153541065752506,
"learning_rate": 0.00010482573726541556,
"loss": 0.0598,
"step": 2130
},
{
"epoch": 1.9124218051831994,
"grad_norm": 0.39952540397644043,
"learning_rate": 0.00010437890974084004,
"loss": 0.1169,
"step": 2140
},
{
"epoch": 1.9213583556747096,
"grad_norm": 5.047325611114502,
"learning_rate": 0.00010393208221626454,
"loss": 0.1492,
"step": 2150
},
{
"epoch": 1.9302949061662198,
"grad_norm": 0.045367881655693054,
"learning_rate": 0.00010348525469168902,
"loss": 0.081,
"step": 2160
},
{
"epoch": 1.9392314566577302,
"grad_norm": 0.02820589952170849,
"learning_rate": 0.00010303842716711349,
"loss": 0.1456,
"step": 2170
},
{
"epoch": 1.9481680071492404,
"grad_norm": 0.15606756508350372,
"learning_rate": 0.000102591599642538,
"loss": 0.0484,
"step": 2180
},
{
"epoch": 1.9571045576407506,
"grad_norm": 4.374292850494385,
"learning_rate": 0.00010214477211796248,
"loss": 0.1133,
"step": 2190
},
{
"epoch": 1.966041108132261,
"grad_norm": 0.6300436854362488,
"learning_rate": 0.00010169794459338695,
"loss": 0.0159,
"step": 2200
},
{
"epoch": 1.974977658623771,
"grad_norm": 0.011597417294979095,
"learning_rate": 0.00010125111706881146,
"loss": 0.019,
"step": 2210
},
{
"epoch": 1.9839142091152815,
"grad_norm": 0.013629280962049961,
"learning_rate": 0.00010080428954423592,
"loss": 0.0953,
"step": 2220
},
{
"epoch": 1.992850759606792,
"grad_norm": 4.461750030517578,
"learning_rate": 0.0001003574620196604,
"loss": 0.1169,
"step": 2230
},
{
"epoch": 2.001787310098302,
"grad_norm": 0.2028690129518509,
"learning_rate": 9.99106344950849e-05,
"loss": 0.0515,
"step": 2240
},
{
"epoch": 2.0107238605898123,
"grad_norm": 0.683179497718811,
"learning_rate": 9.946380697050938e-05,
"loss": 0.0414,
"step": 2250
},
{
"epoch": 2.0196604110813228,
"grad_norm": 0.3097274601459503,
"learning_rate": 9.901697944593388e-05,
"loss": 0.013,
"step": 2260
},
{
"epoch": 2.0285969615728328,
"grad_norm": 0.02391964942216873,
"learning_rate": 9.857015192135836e-05,
"loss": 0.0143,
"step": 2270
},
{
"epoch": 2.037533512064343,
"grad_norm": 0.02549424022436142,
"learning_rate": 9.812332439678284e-05,
"loss": 0.0321,
"step": 2280
},
{
"epoch": 2.0464700625558536,
"grad_norm": 0.015907390043139458,
"learning_rate": 9.767649687220734e-05,
"loss": 0.0905,
"step": 2290
},
{
"epoch": 2.0554066130473636,
"grad_norm": 0.04600854963064194,
"learning_rate": 9.722966934763182e-05,
"loss": 0.0055,
"step": 2300
},
{
"epoch": 2.064343163538874,
"grad_norm": 0.17837274074554443,
"learning_rate": 9.67828418230563e-05,
"loss": 0.0792,
"step": 2310
},
{
"epoch": 2.0732797140303845,
"grad_norm": 0.678176760673523,
"learning_rate": 9.63360142984808e-05,
"loss": 0.1025,
"step": 2320
},
{
"epoch": 2.0822162645218945,
"grad_norm": 0.047438375651836395,
"learning_rate": 9.588918677390528e-05,
"loss": 0.0037,
"step": 2330
},
{
"epoch": 2.091152815013405,
"grad_norm": 0.3825267553329468,
"learning_rate": 9.544235924932976e-05,
"loss": 0.0271,
"step": 2340
},
{
"epoch": 2.1000893655049153,
"grad_norm": 0.022976990789175034,
"learning_rate": 9.499553172475425e-05,
"loss": 0.0055,
"step": 2350
},
{
"epoch": 2.1090259159964253,
"grad_norm": 0.21945427358150482,
"learning_rate": 9.454870420017874e-05,
"loss": 0.0072,
"step": 2360
},
{
"epoch": 2.1179624664879357,
"grad_norm": 0.020401885733008385,
"learning_rate": 9.410187667560322e-05,
"loss": 0.0045,
"step": 2370
},
{
"epoch": 2.126899016979446,
"grad_norm": 0.3614647388458252,
"learning_rate": 9.365504915102771e-05,
"loss": 0.0292,
"step": 2380
},
{
"epoch": 2.135835567470956,
"grad_norm": 0.01699133589863777,
"learning_rate": 9.32082216264522e-05,
"loss": 0.0728,
"step": 2390
},
{
"epoch": 2.1447721179624666,
"grad_norm": 0.012751326896250248,
"learning_rate": 9.276139410187668e-05,
"loss": 0.0358,
"step": 2400
},
{
"epoch": 2.1537086684539766,
"grad_norm": 0.009738125838339329,
"learning_rate": 9.231456657730116e-05,
"loss": 0.0415,
"step": 2410
},
{
"epoch": 2.162645218945487,
"grad_norm": 0.012577983550727367,
"learning_rate": 9.186773905272565e-05,
"loss": 0.0204,
"step": 2420
},
{
"epoch": 2.1715817694369974,
"grad_norm": 0.022706875577569008,
"learning_rate": 9.142091152815015e-05,
"loss": 0.0391,
"step": 2430
},
{
"epoch": 2.1805183199285074,
"grad_norm": 1.2650375366210938,
"learning_rate": 9.097408400357462e-05,
"loss": 0.005,
"step": 2440
},
{
"epoch": 2.189454870420018,
"grad_norm": 0.012098530307412148,
"learning_rate": 9.052725647899911e-05,
"loss": 0.0631,
"step": 2450
},
{
"epoch": 2.1983914209115283,
"grad_norm": 0.014217260293662548,
"learning_rate": 9.00804289544236e-05,
"loss": 0.0158,
"step": 2460
},
{
"epoch": 2.2073279714030383,
"grad_norm": 9.968586921691895,
"learning_rate": 8.963360142984808e-05,
"loss": 0.0338,
"step": 2470
},
{
"epoch": 2.2162645218945487,
"grad_norm": 0.008608737029135227,
"learning_rate": 8.918677390527257e-05,
"loss": 0.0344,
"step": 2480
},
{
"epoch": 2.225201072386059,
"grad_norm": 0.0957435816526413,
"learning_rate": 8.873994638069705e-05,
"loss": 0.0346,
"step": 2490
},
{
"epoch": 2.234137622877569,
"grad_norm": 0.009171651676297188,
"learning_rate": 8.829311885612154e-05,
"loss": 0.0534,
"step": 2500
},
{
"epoch": 2.2430741733690795,
"grad_norm": 0.025571748614311218,
"learning_rate": 8.784629133154603e-05,
"loss": 0.0046,
"step": 2510
},
{
"epoch": 2.25201072386059,
"grad_norm": 0.008803543634712696,
"learning_rate": 8.739946380697051e-05,
"loss": 0.0104,
"step": 2520
},
{
"epoch": 2.2609472743521,
"grad_norm": 0.009746580384671688,
"learning_rate": 8.6952636282395e-05,
"loss": 0.0194,
"step": 2530
},
{
"epoch": 2.2698838248436104,
"grad_norm": 4.104613780975342,
"learning_rate": 8.650580875781949e-05,
"loss": 0.0155,
"step": 2540
},
{
"epoch": 2.278820375335121,
"grad_norm": 0.01826513558626175,
"learning_rate": 8.605898123324397e-05,
"loss": 0.0072,
"step": 2550
},
{
"epoch": 2.287756925826631,
"grad_norm": 0.03380773961544037,
"learning_rate": 8.561215370866847e-05,
"loss": 0.0515,
"step": 2560
},
{
"epoch": 2.2966934763181412,
"grad_norm": 0.13917675614356995,
"learning_rate": 8.516532618409293e-05,
"loss": 0.0553,
"step": 2570
},
{
"epoch": 2.3056300268096512,
"grad_norm": 3.9170970916748047,
"learning_rate": 8.471849865951743e-05,
"loss": 0.0252,
"step": 2580
},
{
"epoch": 2.3145665773011617,
"grad_norm": 0.02010478265583515,
"learning_rate": 8.427167113494193e-05,
"loss": 0.0212,
"step": 2590
},
{
"epoch": 2.323503127792672,
"grad_norm": 0.008358313702046871,
"learning_rate": 8.38248436103664e-05,
"loss": 0.1032,
"step": 2600
},
{
"epoch": 2.3324396782841825,
"grad_norm": 0.08038530498743057,
"learning_rate": 8.337801608579089e-05,
"loss": 0.0445,
"step": 2610
},
{
"epoch": 2.3413762287756925,
"grad_norm": 0.03653928264975548,
"learning_rate": 8.293118856121538e-05,
"loss": 0.0396,
"step": 2620
},
{
"epoch": 2.350312779267203,
"grad_norm": 0.027160342782735825,
"learning_rate": 8.248436103663985e-05,
"loss": 0.0305,
"step": 2630
},
{
"epoch": 2.359249329758713,
"grad_norm": 0.015198041684925556,
"learning_rate": 8.203753351206435e-05,
"loss": 0.0377,
"step": 2640
},
{
"epoch": 2.3681858802502234,
"grad_norm": 0.03799434006214142,
"learning_rate": 8.159070598748883e-05,
"loss": 0.0057,
"step": 2650
},
{
"epoch": 2.377122430741734,
"grad_norm": 0.008046945556998253,
"learning_rate": 8.114387846291331e-05,
"loss": 0.0249,
"step": 2660
},
{
"epoch": 2.386058981233244,
"grad_norm": 8.727446556091309,
"learning_rate": 8.069705093833781e-05,
"loss": 0.0466,
"step": 2670
},
{
"epoch": 2.394995531724754,
"grad_norm": 0.01986142434179783,
"learning_rate": 8.025022341376229e-05,
"loss": 0.0357,
"step": 2680
},
{
"epoch": 2.4039320822162646,
"grad_norm": 7.71134614944458,
"learning_rate": 7.980339588918678e-05,
"loss": 0.015,
"step": 2690
},
{
"epoch": 2.4128686327077746,
"grad_norm": 0.04247535765171051,
"learning_rate": 7.935656836461127e-05,
"loss": 0.0165,
"step": 2700
},
{
"epoch": 2.421805183199285,
"grad_norm": 0.008588094264268875,
"learning_rate": 7.890974084003575e-05,
"loss": 0.0039,
"step": 2710
},
{
"epoch": 2.4307417336907955,
"grad_norm": 0.11789193749427795,
"learning_rate": 7.846291331546024e-05,
"loss": 0.0344,
"step": 2720
},
{
"epoch": 2.4396782841823055,
"grad_norm": 0.02231294848024845,
"learning_rate": 7.801608579088472e-05,
"loss": 0.0248,
"step": 2730
},
{
"epoch": 2.448614834673816,
"grad_norm": 0.017268147319555283,
"learning_rate": 7.75692582663092e-05,
"loss": 0.0716,
"step": 2740
},
{
"epoch": 2.4575513851653263,
"grad_norm": 8.963982582092285,
"learning_rate": 7.71224307417337e-05,
"loss": 0.0282,
"step": 2750
},
{
"epoch": 2.4664879356568363,
"grad_norm": 0.799085259437561,
"learning_rate": 7.667560321715817e-05,
"loss": 0.0416,
"step": 2760
},
{
"epoch": 2.4754244861483468,
"grad_norm": 0.15468931198120117,
"learning_rate": 7.622877569258267e-05,
"loss": 0.0669,
"step": 2770
},
{
"epoch": 2.484361036639857,
"grad_norm": 3.4924068450927734,
"learning_rate": 7.578194816800716e-05,
"loss": 0.0477,
"step": 2780
},
{
"epoch": 2.493297587131367,
"grad_norm": 0.012834394350647926,
"learning_rate": 7.533512064343163e-05,
"loss": 0.0174,
"step": 2790
},
{
"epoch": 2.5022341376228776,
"grad_norm": 0.039204515516757965,
"learning_rate": 7.488829311885612e-05,
"loss": 0.0699,
"step": 2800
},
{
"epoch": 2.5111706881143876,
"grad_norm": 0.08284445852041245,
"learning_rate": 7.444146559428062e-05,
"loss": 0.0445,
"step": 2810
},
{
"epoch": 2.520107238605898,
"grad_norm": 0.010827134363353252,
"learning_rate": 7.39946380697051e-05,
"loss": 0.043,
"step": 2820
},
{
"epoch": 2.5290437890974085,
"grad_norm": 3.5454938411712646,
"learning_rate": 7.354781054512958e-05,
"loss": 0.0339,
"step": 2830
},
{
"epoch": 2.537980339588919,
"grad_norm": 0.006842234171926975,
"learning_rate": 7.310098302055406e-05,
"loss": 0.0029,
"step": 2840
},
{
"epoch": 2.546916890080429,
"grad_norm": 0.7790193557739258,
"learning_rate": 7.265415549597856e-05,
"loss": 0.0055,
"step": 2850
},
{
"epoch": 2.5558534405719393,
"grad_norm": 0.022239111363887787,
"learning_rate": 7.220732797140304e-05,
"loss": 0.008,
"step": 2860
},
{
"epoch": 2.5647899910634493,
"grad_norm": 0.05403418838977814,
"learning_rate": 7.176050044682752e-05,
"loss": 0.057,
"step": 2870
},
{
"epoch": 2.5737265415549597,
"grad_norm": 0.008923870511353016,
"learning_rate": 7.131367292225202e-05,
"loss": 0.0045,
"step": 2880
},
{
"epoch": 2.58266309204647,
"grad_norm": 0.02668040059506893,
"learning_rate": 7.08668453976765e-05,
"loss": 0.0551,
"step": 2890
},
{
"epoch": 2.5915996425379806,
"grad_norm": 0.049835577607154846,
"learning_rate": 7.042001787310098e-05,
"loss": 0.0255,
"step": 2900
},
{
"epoch": 2.6005361930294906,
"grad_norm": 0.19334334135055542,
"learning_rate": 6.997319034852548e-05,
"loss": 0.0434,
"step": 2910
},
{
"epoch": 2.609472743521001,
"grad_norm": 2.9139554500579834,
"learning_rate": 6.952636282394996e-05,
"loss": 0.0069,
"step": 2920
},
{
"epoch": 2.618409294012511,
"grad_norm": 0.006679228041321039,
"learning_rate": 6.907953529937444e-05,
"loss": 0.0021,
"step": 2930
},
{
"epoch": 2.6273458445040214,
"grad_norm": 0.1680416613817215,
"learning_rate": 6.863270777479894e-05,
"loss": 0.0249,
"step": 2940
},
{
"epoch": 2.636282394995532,
"grad_norm": 0.08290654420852661,
"learning_rate": 6.818588025022342e-05,
"loss": 0.029,
"step": 2950
},
{
"epoch": 2.645218945487042,
"grad_norm": 0.013707391917705536,
"learning_rate": 6.77390527256479e-05,
"loss": 0.0124,
"step": 2960
},
{
"epoch": 2.6541554959785523,
"grad_norm": 0.2275378704071045,
"learning_rate": 6.72922252010724e-05,
"loss": 0.035,
"step": 2970
},
{
"epoch": 2.6630920464700627,
"grad_norm": 0.5669155716896057,
"learning_rate": 6.684539767649688e-05,
"loss": 0.0288,
"step": 2980
},
{
"epoch": 2.6720285969615727,
"grad_norm": 0.01488091703504324,
"learning_rate": 6.639857015192136e-05,
"loss": 0.0438,
"step": 2990
},
{
"epoch": 2.680965147453083,
"grad_norm": 3.9659953117370605,
"learning_rate": 6.595174262734584e-05,
"loss": 0.0652,
"step": 3000
},
{
"epoch": 2.680965147453083,
"eval_accuracy": 0.9793969849246231,
"eval_loss": 0.08239442110061646,
"eval_runtime": 56.1213,
"eval_samples_per_second": 35.459,
"eval_steps_per_second": 4.437,
"step": 3000
},
{
"epoch": 2.6899016979445936,
"grad_norm": 8.31395149230957,
"learning_rate": 6.550491510277034e-05,
"loss": 0.0098,
"step": 3010
},
{
"epoch": 2.6988382484361035,
"grad_norm": 0.008468572981655598,
"learning_rate": 6.505808757819482e-05,
"loss": 0.1056,
"step": 3020
},
{
"epoch": 2.707774798927614,
"grad_norm": 0.9328808188438416,
"learning_rate": 6.46112600536193e-05,
"loss": 0.0769,
"step": 3030
},
{
"epoch": 2.716711349419124,
"grad_norm": 0.6114912629127502,
"learning_rate": 6.41644325290438e-05,
"loss": 0.0434,
"step": 3040
},
{
"epoch": 2.7256478999106344,
"grad_norm": 0.03709472343325615,
"learning_rate": 6.371760500446829e-05,
"loss": 0.0166,
"step": 3050
},
{
"epoch": 2.734584450402145,
"grad_norm": 0.1086587980389595,
"learning_rate": 6.327077747989276e-05,
"loss": 0.0047,
"step": 3060
},
{
"epoch": 2.7435210008936552,
"grad_norm": 0.12008140981197357,
"learning_rate": 6.282394995531725e-05,
"loss": 0.0069,
"step": 3070
},
{
"epoch": 2.7524575513851652,
"grad_norm": 0.017355024814605713,
"learning_rate": 6.237712243074174e-05,
"loss": 0.0033,
"step": 3080
},
{
"epoch": 2.7613941018766757,
"grad_norm": 0.15070508420467377,
"learning_rate": 6.193029490616622e-05,
"loss": 0.0476,
"step": 3090
},
{
"epoch": 2.7703306523681857,
"grad_norm": 0.022527649998664856,
"learning_rate": 6.148346738159071e-05,
"loss": 0.0243,
"step": 3100
},
{
"epoch": 2.779267202859696,
"grad_norm": 0.37779930233955383,
"learning_rate": 6.10366398570152e-05,
"loss": 0.0058,
"step": 3110
},
{
"epoch": 2.7882037533512065,
"grad_norm": 0.029893942177295685,
"learning_rate": 6.0589812332439676e-05,
"loss": 0.0208,
"step": 3120
},
{
"epoch": 2.797140303842717,
"grad_norm": 0.01635076478123665,
"learning_rate": 6.0142984807864165e-05,
"loss": 0.0026,
"step": 3130
},
{
"epoch": 2.806076854334227,
"grad_norm": 0.011868173256516457,
"learning_rate": 5.969615728328865e-05,
"loss": 0.0257,
"step": 3140
},
{
"epoch": 2.8150134048257374,
"grad_norm": 0.02559722028672695,
"learning_rate": 5.9249329758713135e-05,
"loss": 0.0666,
"step": 3150
},
{
"epoch": 2.8239499553172474,
"grad_norm": 0.01763424649834633,
"learning_rate": 5.8802502234137623e-05,
"loss": 0.0611,
"step": 3160
},
{
"epoch": 2.832886505808758,
"grad_norm": 0.02686423808336258,
"learning_rate": 5.835567470956211e-05,
"loss": 0.0039,
"step": 3170
},
{
"epoch": 2.841823056300268,
"grad_norm": 0.04632404074072838,
"learning_rate": 5.79088471849866e-05,
"loss": 0.0122,
"step": 3180
},
{
"epoch": 2.8507596067917786,
"grad_norm": 0.1586790531873703,
"learning_rate": 5.746201966041108e-05,
"loss": 0.0026,
"step": 3190
},
{
"epoch": 2.8596961572832886,
"grad_norm": 5.425605297088623,
"learning_rate": 5.701519213583557e-05,
"loss": 0.0622,
"step": 3200
},
{
"epoch": 2.868632707774799,
"grad_norm": 0.006181008648127317,
"learning_rate": 5.656836461126006e-05,
"loss": 0.0028,
"step": 3210
},
{
"epoch": 2.877569258266309,
"grad_norm": 0.09517185389995575,
"learning_rate": 5.612153708668454e-05,
"loss": 0.0035,
"step": 3220
},
{
"epoch": 2.8865058087578195,
"grad_norm": 0.015022194012999535,
"learning_rate": 5.567470956210903e-05,
"loss": 0.0285,
"step": 3230
},
{
"epoch": 2.89544235924933,
"grad_norm": 4.772485256195068,
"learning_rate": 5.522788203753352e-05,
"loss": 0.0279,
"step": 3240
},
{
"epoch": 2.90437890974084,
"grad_norm": 3.1032145023345947,
"learning_rate": 5.478105451295799e-05,
"loss": 0.0049,
"step": 3250
},
{
"epoch": 2.9133154602323503,
"grad_norm": 0.05868244543671608,
"learning_rate": 5.433422698838249e-05,
"loss": 0.0029,
"step": 3260
},
{
"epoch": 2.9222520107238603,
"grad_norm": 0.008307090029120445,
"learning_rate": 5.388739946380698e-05,
"loss": 0.0099,
"step": 3270
},
{
"epoch": 2.9311885612153707,
"grad_norm": 0.010392882861196995,
"learning_rate": 5.344057193923145e-05,
"loss": 0.002,
"step": 3280
},
{
"epoch": 2.940125111706881,
"grad_norm": 0.005523020401597023,
"learning_rate": 5.299374441465594e-05,
"loss": 0.0035,
"step": 3290
},
{
"epoch": 2.9490616621983916,
"grad_norm": 0.06098335236310959,
"learning_rate": 5.2546916890080436e-05,
"loss": 0.0056,
"step": 3300
},
{
"epoch": 2.9579982126899016,
"grad_norm": 0.013083376921713352,
"learning_rate": 5.2100089365504925e-05,
"loss": 0.023,
"step": 3310
},
{
"epoch": 2.966934763181412,
"grad_norm": 0.01605415530502796,
"learning_rate": 5.16532618409294e-05,
"loss": 0.0396,
"step": 3320
},
{
"epoch": 2.975871313672922,
"grad_norm": 0.013243346475064754,
"learning_rate": 5.120643431635389e-05,
"loss": 0.0083,
"step": 3330
},
{
"epoch": 2.9848078641644324,
"grad_norm": 1.4108890295028687,
"learning_rate": 5.0759606791778383e-05,
"loss": 0.0468,
"step": 3340
},
{
"epoch": 2.993744414655943,
"grad_norm": 0.5704414248466492,
"learning_rate": 5.031277926720286e-05,
"loss": 0.0209,
"step": 3350
},
{
"epoch": 3.002680965147453,
"grad_norm": 0.03908452019095421,
"learning_rate": 4.986595174262735e-05,
"loss": 0.0779,
"step": 3360
},
{
"epoch": 3.0116175156389633,
"grad_norm": 0.010959290899336338,
"learning_rate": 4.9419124218051835e-05,
"loss": 0.0109,
"step": 3370
},
{
"epoch": 3.0205540661304737,
"grad_norm": 0.028490547090768814,
"learning_rate": 4.8972296693476324e-05,
"loss": 0.0025,
"step": 3380
},
{
"epoch": 3.0294906166219837,
"grad_norm": 0.00491972966119647,
"learning_rate": 4.8525469168900806e-05,
"loss": 0.0214,
"step": 3390
},
{
"epoch": 3.038427167113494,
"grad_norm": 0.014270992018282413,
"learning_rate": 4.8078641644325294e-05,
"loss": 0.0048,
"step": 3400
},
{
"epoch": 3.0473637176050046,
"grad_norm": 0.00458119623363018,
"learning_rate": 4.7631814119749776e-05,
"loss": 0.0308,
"step": 3410
},
{
"epoch": 3.0563002680965146,
"grad_norm": 0.00890402402728796,
"learning_rate": 4.7184986595174265e-05,
"loss": 0.0019,
"step": 3420
},
{
"epoch": 3.065236818588025,
"grad_norm": 0.004751246422529221,
"learning_rate": 4.673815907059875e-05,
"loss": 0.0028,
"step": 3430
},
{
"epoch": 3.0741733690795354,
"grad_norm": 0.008143426850438118,
"learning_rate": 4.6291331546023235e-05,
"loss": 0.0015,
"step": 3440
},
{
"epoch": 3.0831099195710454,
"grad_norm": 0.035306982696056366,
"learning_rate": 4.5844504021447723e-05,
"loss": 0.0561,
"step": 3450
},
{
"epoch": 3.092046470062556,
"grad_norm": 0.006312028504908085,
"learning_rate": 4.539767649687221e-05,
"loss": 0.0407,
"step": 3460
},
{
"epoch": 3.1009830205540663,
"grad_norm": 0.012918233871459961,
"learning_rate": 4.4950848972296694e-05,
"loss": 0.0204,
"step": 3470
},
{
"epoch": 3.1099195710455763,
"grad_norm": 0.03429726883769035,
"learning_rate": 4.450402144772118e-05,
"loss": 0.0138,
"step": 3480
},
{
"epoch": 3.1188561215370867,
"grad_norm": 0.032142043113708496,
"learning_rate": 4.405719392314567e-05,
"loss": 0.0074,
"step": 3490
},
{
"epoch": 3.127792672028597,
"grad_norm": 0.11621160060167313,
"learning_rate": 4.361036639857015e-05,
"loss": 0.007,
"step": 3500
},
{
"epoch": 3.136729222520107,
"grad_norm": 0.010225760750472546,
"learning_rate": 4.316353887399464e-05,
"loss": 0.0371,
"step": 3510
},
{
"epoch": 3.1456657730116175,
"grad_norm": 0.0270242840051651,
"learning_rate": 4.271671134941912e-05,
"loss": 0.0024,
"step": 3520
},
{
"epoch": 3.154602323503128,
"grad_norm": 0.561730146408081,
"learning_rate": 4.226988382484361e-05,
"loss": 0.0322,
"step": 3530
},
{
"epoch": 3.163538873994638,
"grad_norm": 3.7698066234588623,
"learning_rate": 4.18230563002681e-05,
"loss": 0.0061,
"step": 3540
},
{
"epoch": 3.1724754244861484,
"grad_norm": 0.08852257579565048,
"learning_rate": 4.137622877569258e-05,
"loss": 0.002,
"step": 3550
},
{
"epoch": 3.181411974977659,
"grad_norm": 0.010241570882499218,
"learning_rate": 4.092940125111707e-05,
"loss": 0.0032,
"step": 3560
},
{
"epoch": 3.190348525469169,
"grad_norm": 0.02900160290300846,
"learning_rate": 4.048257372654156e-05,
"loss": 0.0021,
"step": 3570
},
{
"epoch": 3.1992850759606792,
"grad_norm": 0.012413430958986282,
"learning_rate": 4.003574620196605e-05,
"loss": 0.0016,
"step": 3580
},
{
"epoch": 3.2082216264521897,
"grad_norm": 0.011820780113339424,
"learning_rate": 3.958891867739053e-05,
"loss": 0.0156,
"step": 3590
},
{
"epoch": 3.2171581769436997,
"grad_norm": 0.0063424003310501575,
"learning_rate": 3.914209115281501e-05,
"loss": 0.0066,
"step": 3600
},
{
"epoch": 3.22609472743521,
"grad_norm": 0.014534726738929749,
"learning_rate": 3.8695263628239506e-05,
"loss": 0.0023,
"step": 3610
},
{
"epoch": 3.23503127792672,
"grad_norm": 0.0037305313162505627,
"learning_rate": 3.824843610366399e-05,
"loss": 0.0014,
"step": 3620
},
{
"epoch": 3.2439678284182305,
"grad_norm": 0.004174220375716686,
"learning_rate": 3.780160857908847e-05,
"loss": 0.0022,
"step": 3630
},
{
"epoch": 3.252904378909741,
"grad_norm": 0.02620732970535755,
"learning_rate": 3.735478105451296e-05,
"loss": 0.0033,
"step": 3640
},
{
"epoch": 3.2618409294012514,
"grad_norm": 0.008887135423719883,
"learning_rate": 3.690795352993745e-05,
"loss": 0.0137,
"step": 3650
},
{
"epoch": 3.2707774798927614,
"grad_norm": 0.0036694956943392754,
"learning_rate": 3.6461126005361935e-05,
"loss": 0.0016,
"step": 3660
},
{
"epoch": 3.279714030384272,
"grad_norm": 0.005121259950101376,
"learning_rate": 3.601429848078642e-05,
"loss": 0.0023,
"step": 3670
},
{
"epoch": 3.2886505808757818,
"grad_norm": 0.005332967732101679,
"learning_rate": 3.55674709562109e-05,
"loss": 0.0508,
"step": 3680
},
{
"epoch": 3.297587131367292,
"grad_norm": 0.008636276237666607,
"learning_rate": 3.5120643431635394e-05,
"loss": 0.0015,
"step": 3690
},
{
"epoch": 3.3065236818588026,
"grad_norm": 0.004048788454383612,
"learning_rate": 3.4673815907059876e-05,
"loss": 0.0015,
"step": 3700
},
{
"epoch": 3.3154602323503126,
"grad_norm": 0.013148046098649502,
"learning_rate": 3.4226988382484365e-05,
"loss": 0.0021,
"step": 3710
},
{
"epoch": 3.324396782841823,
"grad_norm": 0.003611048450693488,
"learning_rate": 3.3780160857908846e-05,
"loss": 0.0018,
"step": 3720
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.0047615463845431805,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.002,
"step": 3730
},
{
"epoch": 3.3422698838248435,
"grad_norm": 0.052058279514312744,
"learning_rate": 3.2886505808757823e-05,
"loss": 0.0017,
"step": 3740
},
{
"epoch": 3.351206434316354,
"grad_norm": 0.004867528565227985,
"learning_rate": 3.2439678284182305e-05,
"loss": 0.0015,
"step": 3750
},
{
"epoch": 3.3601429848078643,
"grad_norm": 0.005437952931970358,
"learning_rate": 3.1992850759606794e-05,
"loss": 0.0027,
"step": 3760
},
{
"epoch": 3.3690795352993743,
"grad_norm": 0.08657950907945633,
"learning_rate": 3.154602323503128e-05,
"loss": 0.0012,
"step": 3770
},
{
"epoch": 3.3780160857908847,
"grad_norm": 0.003917807713150978,
"learning_rate": 3.1099195710455764e-05,
"loss": 0.0016,
"step": 3780
},
{
"epoch": 3.386952636282395,
"grad_norm": 0.03561088442802429,
"learning_rate": 3.065236818588025e-05,
"loss": 0.0063,
"step": 3790
},
{
"epoch": 3.395889186773905,
"grad_norm": 0.021922320127487183,
"learning_rate": 3.0205540661304738e-05,
"loss": 0.0195,
"step": 3800
},
{
"epoch": 3.4048257372654156,
"grad_norm": 0.009989120066165924,
"learning_rate": 2.9758713136729223e-05,
"loss": 0.0027,
"step": 3810
},
{
"epoch": 3.413762287756926,
"grad_norm": 0.004757929127663374,
"learning_rate": 2.931188561215371e-05,
"loss": 0.0213,
"step": 3820
},
{
"epoch": 3.422698838248436,
"grad_norm": 0.005087022669613361,
"learning_rate": 2.8865058087578197e-05,
"loss": 0.012,
"step": 3830
},
{
"epoch": 3.4316353887399464,
"grad_norm": 0.04014687240123749,
"learning_rate": 2.8418230563002685e-05,
"loss": 0.0016,
"step": 3840
},
{
"epoch": 3.4405719392314564,
"grad_norm": 0.008556324057281017,
"learning_rate": 2.7971403038427167e-05,
"loss": 0.0466,
"step": 3850
},
{
"epoch": 3.449508489722967,
"grad_norm": 0.0066629331558942795,
"learning_rate": 2.7524575513851652e-05,
"loss": 0.0013,
"step": 3860
},
{
"epoch": 3.4584450402144773,
"grad_norm": 0.007047568913549185,
"learning_rate": 2.707774798927614e-05,
"loss": 0.0015,
"step": 3870
},
{
"epoch": 3.4673815907059877,
"grad_norm": 0.0033304065000265837,
"learning_rate": 2.6630920464700626e-05,
"loss": 0.0186,
"step": 3880
},
{
"epoch": 3.4763181411974977,
"grad_norm": 0.043915342539548874,
"learning_rate": 2.6184092940125114e-05,
"loss": 0.0013,
"step": 3890
},
{
"epoch": 3.485254691689008,
"grad_norm": 0.005252317525446415,
"learning_rate": 2.57372654155496e-05,
"loss": 0.0036,
"step": 3900
},
{
"epoch": 3.494191242180518,
"grad_norm": 0.005055012181401253,
"learning_rate": 2.5290437890974085e-05,
"loss": 0.0012,
"step": 3910
},
{
"epoch": 3.5031277926720286,
"grad_norm": 0.0049805790185928345,
"learning_rate": 2.484361036639857e-05,
"loss": 0.0157,
"step": 3920
},
{
"epoch": 3.512064343163539,
"grad_norm": 0.009514909237623215,
"learning_rate": 2.439678284182306e-05,
"loss": 0.0109,
"step": 3930
},
{
"epoch": 3.5210008936550494,
"grad_norm": 0.03643026947975159,
"learning_rate": 2.3949955317247544e-05,
"loss": 0.0019,
"step": 3940
},
{
"epoch": 3.5299374441465594,
"grad_norm": 0.056902140378952026,
"learning_rate": 2.3503127792672032e-05,
"loss": 0.0016,
"step": 3950
},
{
"epoch": 3.53887399463807,
"grad_norm": 0.10358071327209473,
"learning_rate": 2.3056300268096514e-05,
"loss": 0.005,
"step": 3960
},
{
"epoch": 3.54781054512958,
"grad_norm": 0.005386151373386383,
"learning_rate": 2.2609472743521002e-05,
"loss": 0.0021,
"step": 3970
},
{
"epoch": 3.5567470956210903,
"grad_norm": 0.007350238971412182,
"learning_rate": 2.2162645218945488e-05,
"loss": 0.0016,
"step": 3980
},
{
"epoch": 3.5656836461126007,
"grad_norm": 0.07326429337263107,
"learning_rate": 2.1715817694369976e-05,
"loss": 0.0084,
"step": 3990
},
{
"epoch": 3.5746201966041107,
"grad_norm": 0.005603461060672998,
"learning_rate": 2.126899016979446e-05,
"loss": 0.0011,
"step": 4000
},
{
"epoch": 3.5746201966041107,
"eval_accuracy": 0.9814070351758793,
"eval_loss": 0.0710952952504158,
"eval_runtime": 56.3405,
"eval_samples_per_second": 35.321,
"eval_steps_per_second": 4.42,
"step": 4000
},
{
"epoch": 3.583556747095621,
"grad_norm": 0.010818341746926308,
"learning_rate": 2.0822162645218946e-05,
"loss": 0.0038,
"step": 4010
},
{
"epoch": 3.592493297587131,
"grad_norm": 0.003599151037633419,
"learning_rate": 2.037533512064343e-05,
"loss": 0.0065,
"step": 4020
},
{
"epoch": 3.6014298480786415,
"grad_norm": 4.198567867279053,
"learning_rate": 1.992850759606792e-05,
"loss": 0.0083,
"step": 4030
},
{
"epoch": 3.610366398570152,
"grad_norm": 0.013494855724275112,
"learning_rate": 1.9481680071492405e-05,
"loss": 0.0045,
"step": 4040
},
{
"epoch": 3.6193029490616624,
"grad_norm": 0.0036234534345567226,
"learning_rate": 1.903485254691689e-05,
"loss": 0.0016,
"step": 4050
},
{
"epoch": 3.6282394995531724,
"grad_norm": 0.021920403465628624,
"learning_rate": 1.8588025022341376e-05,
"loss": 0.0012,
"step": 4060
},
{
"epoch": 3.637176050044683,
"grad_norm": 0.004384295083582401,
"learning_rate": 1.8141197497765864e-05,
"loss": 0.001,
"step": 4070
},
{
"epoch": 3.646112600536193,
"grad_norm": 0.03161391615867615,
"learning_rate": 1.769436997319035e-05,
"loss": 0.0026,
"step": 4080
},
{
"epoch": 3.6550491510277032,
"grad_norm": 0.0033394452184438705,
"learning_rate": 1.7247542448614838e-05,
"loss": 0.0267,
"step": 4090
},
{
"epoch": 3.6639857015192137,
"grad_norm": 0.01090541947633028,
"learning_rate": 1.680071492403932e-05,
"loss": 0.0014,
"step": 4100
},
{
"epoch": 3.672922252010724,
"grad_norm": 0.0053653959184885025,
"learning_rate": 1.6353887399463808e-05,
"loss": 0.0095,
"step": 4110
},
{
"epoch": 3.681858802502234,
"grad_norm": 0.032379720360040665,
"learning_rate": 1.5907059874888293e-05,
"loss": 0.0011,
"step": 4120
},
{
"epoch": 3.6907953529937445,
"grad_norm": 0.05944305285811424,
"learning_rate": 1.5460232350312782e-05,
"loss": 0.0015,
"step": 4130
},
{
"epoch": 3.6997319034852545,
"grad_norm": 0.0054045203141868114,
"learning_rate": 1.5013404825737265e-05,
"loss": 0.0012,
"step": 4140
},
{
"epoch": 3.708668453976765,
"grad_norm": 0.003021675394847989,
"learning_rate": 1.4566577301161752e-05,
"loss": 0.0023,
"step": 4150
},
{
"epoch": 3.7176050044682754,
"grad_norm": 0.007955508306622505,
"learning_rate": 1.4119749776586239e-05,
"loss": 0.0031,
"step": 4160
},
{
"epoch": 3.726541554959786,
"grad_norm": 0.005485454574227333,
"learning_rate": 1.3672922252010726e-05,
"loss": 0.0014,
"step": 4170
},
{
"epoch": 3.7354781054512958,
"grad_norm": 0.007910342887043953,
"learning_rate": 1.322609472743521e-05,
"loss": 0.0014,
"step": 4180
},
{
"epoch": 3.744414655942806,
"grad_norm": 0.011793126352131367,
"learning_rate": 1.2779267202859696e-05,
"loss": 0.001,
"step": 4190
},
{
"epoch": 3.753351206434316,
"grad_norm": 0.005442539695650339,
"learning_rate": 1.2332439678284183e-05,
"loss": 0.0015,
"step": 4200
},
{
"epoch": 3.7622877569258266,
"grad_norm": 1.1986395120620728,
"learning_rate": 1.188561215370867e-05,
"loss": 0.002,
"step": 4210
},
{
"epoch": 3.771224307417337,
"grad_norm": 0.006608502473682165,
"learning_rate": 1.1438784629133155e-05,
"loss": 0.0009,
"step": 4220
},
{
"epoch": 3.780160857908847,
"grad_norm": 0.0039040117990225554,
"learning_rate": 1.0991957104557642e-05,
"loss": 0.0013,
"step": 4230
},
{
"epoch": 3.7890974084003575,
"grad_norm": 0.0041880221106112,
"learning_rate": 1.0545129579982127e-05,
"loss": 0.0012,
"step": 4240
},
{
"epoch": 3.798033958891868,
"grad_norm": 0.003776776837185025,
"learning_rate": 1.0098302055406614e-05,
"loss": 0.0013,
"step": 4250
},
{
"epoch": 3.806970509383378,
"grad_norm": 0.2970888614654541,
"learning_rate": 9.651474530831099e-06,
"loss": 0.0014,
"step": 4260
},
{
"epoch": 3.8159070598748883,
"grad_norm": 0.003879937343299389,
"learning_rate": 9.204647006255586e-06,
"loss": 0.0013,
"step": 4270
},
{
"epoch": 3.8248436103663987,
"grad_norm": 3.5169312953948975,
"learning_rate": 8.757819481680071e-06,
"loss": 0.0035,
"step": 4280
},
{
"epoch": 3.8337801608579087,
"grad_norm": 0.004920534789562225,
"learning_rate": 8.310991957104558e-06,
"loss": 0.001,
"step": 4290
},
{
"epoch": 3.842716711349419,
"grad_norm": 0.0035059740766882896,
"learning_rate": 7.864164432529045e-06,
"loss": 0.0094,
"step": 4300
},
{
"epoch": 3.851653261840929,
"grad_norm": 0.004144645761698484,
"learning_rate": 7.41733690795353e-06,
"loss": 0.001,
"step": 4310
},
{
"epoch": 3.8605898123324396,
"grad_norm": 0.006385812535881996,
"learning_rate": 6.970509383378017e-06,
"loss": 0.001,
"step": 4320
},
{
"epoch": 3.86952636282395,
"grad_norm": 0.003677819389849901,
"learning_rate": 6.523681858802503e-06,
"loss": 0.0009,
"step": 4330
},
{
"epoch": 3.8784629133154604,
"grad_norm": 0.003563833888620138,
"learning_rate": 6.076854334226989e-06,
"loss": 0.0251,
"step": 4340
},
{
"epoch": 3.8873994638069704,
"grad_norm": 0.012182756327092648,
"learning_rate": 5.630026809651475e-06,
"loss": 0.0422,
"step": 4350
},
{
"epoch": 3.896336014298481,
"grad_norm": 0.004781792871654034,
"learning_rate": 5.1831992850759615e-06,
"loss": 0.0021,
"step": 4360
},
{
"epoch": 3.905272564789991,
"grad_norm": 0.003455075901001692,
"learning_rate": 4.7363717605004475e-06,
"loss": 0.0182,
"step": 4370
},
{
"epoch": 3.9142091152815013,
"grad_norm": 0.00627366965636611,
"learning_rate": 4.2895442359249335e-06,
"loss": 0.0148,
"step": 4380
},
{
"epoch": 3.9231456657730117,
"grad_norm": 0.015941530466079712,
"learning_rate": 3.8427167113494195e-06,
"loss": 0.0015,
"step": 4390
},
{
"epoch": 3.932082216264522,
"grad_norm": 0.004724125377833843,
"learning_rate": 3.3958891867739055e-06,
"loss": 0.0093,
"step": 4400
},
{
"epoch": 3.941018766756032,
"grad_norm": 0.0062377783469855785,
"learning_rate": 2.9490616621983915e-06,
"loss": 0.0011,
"step": 4410
},
{
"epoch": 3.9499553172475426,
"grad_norm": 0.0042613474652171135,
"learning_rate": 2.502234137622878e-06,
"loss": 0.0141,
"step": 4420
},
{
"epoch": 3.9588918677390526,
"grad_norm": 0.005040575284510851,
"learning_rate": 2.055406613047364e-06,
"loss": 0.0061,
"step": 4430
},
{
"epoch": 3.967828418230563,
"grad_norm": 0.004678263328969479,
"learning_rate": 1.60857908847185e-06,
"loss": 0.0011,
"step": 4440
},
{
"epoch": 3.9767649687220734,
"grad_norm": 0.0033705062232911587,
"learning_rate": 1.161751563896336e-06,
"loss": 0.0065,
"step": 4450
},
{
"epoch": 3.9857015192135834,
"grad_norm": 0.004543005023151636,
"learning_rate": 7.149240393208222e-07,
"loss": 0.0011,
"step": 4460
},
{
"epoch": 3.994638069705094,
"grad_norm": 0.01603855937719345,
"learning_rate": 2.6809651474530835e-07,
"loss": 0.001,
"step": 4470
},
{
"epoch": 4.0,
"step": 4476,
"total_flos": 5.549295064059888e+18,
"train_loss": 0.1165861947240576,
"train_runtime": 2488.5837,
"train_samples_per_second": 28.775,
"train_steps_per_second": 1.799
}
],
"logging_steps": 10,
"max_steps": 4476,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.549295064059888e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}