Master_V1 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8310652178429703,
"eval_steps": 500,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003324260871371881,
"grad_norm": 2.5143792629241943,
"learning_rate": 1.6622340425531916e-08,
"loss": 9.0836,
"step": 10
},
{
"epoch": 0.006648521742743762,
"grad_norm": 2.6173431873321533,
"learning_rate": 3.324468085106383e-08,
"loss": 9.0829,
"step": 20
},
{
"epoch": 0.009972782614115643,
"grad_norm": 2.5349628925323486,
"learning_rate": 4.9867021276595746e-08,
"loss": 9.0061,
"step": 30
},
{
"epoch": 0.013297043485487523,
"grad_norm": 2.3916308879852295,
"learning_rate": 6.648936170212767e-08,
"loss": 8.9747,
"step": 40
},
{
"epoch": 0.016621304356859403,
"grad_norm": 2.5103342533111572,
"learning_rate": 8.311170212765958e-08,
"loss": 9.0057,
"step": 50
},
{
"epoch": 0.019945565228231286,
"grad_norm": 2.421079397201538,
"learning_rate": 9.973404255319149e-08,
"loss": 8.9885,
"step": 60
},
{
"epoch": 0.023269826099603166,
"grad_norm": 2.6052393913269043,
"learning_rate": 1.163563829787234e-07,
"loss": 8.9706,
"step": 70
},
{
"epoch": 0.026594086970975046,
"grad_norm": 2.376847505569458,
"learning_rate": 1.3297872340425533e-07,
"loss": 9.0211,
"step": 80
},
{
"epoch": 0.02991834784234693,
"grad_norm": 2.6200971603393555,
"learning_rate": 1.4960106382978723e-07,
"loss": 8.9903,
"step": 90
},
{
"epoch": 0.033242608713718806,
"grad_norm": 2.515320301055908,
"learning_rate": 1.6622340425531916e-07,
"loss": 8.9643,
"step": 100
},
{
"epoch": 0.03656686958509069,
"grad_norm": 2.4840102195739746,
"learning_rate": 1.8284574468085108e-07,
"loss": 8.9761,
"step": 110
},
{
"epoch": 0.03989113045646257,
"grad_norm": 2.5950074195861816,
"learning_rate": 1.9946808510638298e-07,
"loss": 9.0101,
"step": 120
},
{
"epoch": 0.04321539132783445,
"grad_norm": 2.530604839324951,
"learning_rate": 2.160904255319149e-07,
"loss": 8.961,
"step": 130
},
{
"epoch": 0.04653965219920633,
"grad_norm": 2.5579464435577393,
"learning_rate": 2.327127659574468e-07,
"loss": 8.8733,
"step": 140
},
{
"epoch": 0.04986391307057821,
"grad_norm": 2.638901472091675,
"learning_rate": 2.4933510638297876e-07,
"loss": 8.9534,
"step": 150
},
{
"epoch": 0.05318817394195009,
"grad_norm": 2.6817493438720703,
"learning_rate": 2.6595744680851066e-07,
"loss": 9.0014,
"step": 160
},
{
"epoch": 0.05651243481332197,
"grad_norm": 2.6700024604797363,
"learning_rate": 2.8257978723404256e-07,
"loss": 8.8832,
"step": 170
},
{
"epoch": 0.05983669568469386,
"grad_norm": 2.794243335723877,
"learning_rate": 2.9920212765957446e-07,
"loss": 8.9012,
"step": 180
},
{
"epoch": 0.06316095655606574,
"grad_norm": 3.000873327255249,
"learning_rate": 3.1582446808510636e-07,
"loss": 8.7874,
"step": 190
},
{
"epoch": 0.06648521742743761,
"grad_norm": 2.872612714767456,
"learning_rate": 3.324468085106383e-07,
"loss": 8.8558,
"step": 200
},
{
"epoch": 0.0698094782988095,
"grad_norm": 2.9133315086364746,
"learning_rate": 3.490691489361702e-07,
"loss": 8.8333,
"step": 210
},
{
"epoch": 0.07313373917018139,
"grad_norm": 3.1017534732818604,
"learning_rate": 3.6569148936170217e-07,
"loss": 8.8199,
"step": 220
},
{
"epoch": 0.07645800004155326,
"grad_norm": 2.9153056144714355,
"learning_rate": 3.8231382978723407e-07,
"loss": 8.8266,
"step": 230
},
{
"epoch": 0.07978226091292515,
"grad_norm": 3.0996434688568115,
"learning_rate": 3.9893617021276597e-07,
"loss": 8.7202,
"step": 240
},
{
"epoch": 0.08310652178429702,
"grad_norm": 3.257809638977051,
"learning_rate": 4.1555851063829787e-07,
"loss": 8.6149,
"step": 250
},
{
"epoch": 0.0864307826556689,
"grad_norm": 3.378631353378296,
"learning_rate": 4.321808510638298e-07,
"loss": 8.5092,
"step": 260
},
{
"epoch": 0.08975504352704078,
"grad_norm": 3.3546876907348633,
"learning_rate": 4.488031914893618e-07,
"loss": 8.477,
"step": 270
},
{
"epoch": 0.09307930439841267,
"grad_norm": 3.292569637298584,
"learning_rate": 4.654255319148936e-07,
"loss": 8.4594,
"step": 280
},
{
"epoch": 0.09640356526978455,
"grad_norm": 3.190239906311035,
"learning_rate": 4.820478723404255e-07,
"loss": 8.3134,
"step": 290
},
{
"epoch": 0.09972782614115643,
"grad_norm": 3.2521212100982666,
"learning_rate": 4.986702127659575e-07,
"loss": 8.2896,
"step": 300
},
{
"epoch": 0.10305208701252831,
"grad_norm": 3.399919033050537,
"learning_rate": 5.152925531914893e-07,
"loss": 8.1656,
"step": 310
},
{
"epoch": 0.10637634788390019,
"grad_norm": 3.412688970565796,
"learning_rate": 5.319148936170213e-07,
"loss": 7.9764,
"step": 320
},
{
"epoch": 0.10970060875527207,
"grad_norm": 3.2669174671173096,
"learning_rate": 5.485372340425532e-07,
"loss": 7.9755,
"step": 330
},
{
"epoch": 0.11302486962664395,
"grad_norm": 3.405444383621216,
"learning_rate": 5.651595744680851e-07,
"loss": 7.8587,
"step": 340
},
{
"epoch": 0.11634913049801583,
"grad_norm": 3.2224161624908447,
"learning_rate": 5.81781914893617e-07,
"loss": 7.7357,
"step": 350
},
{
"epoch": 0.11967339136938772,
"grad_norm": 3.230048894882202,
"learning_rate": 5.984042553191489e-07,
"loss": 7.5667,
"step": 360
},
{
"epoch": 0.12299765224075959,
"grad_norm": 3.2728312015533447,
"learning_rate": 6.150265957446809e-07,
"loss": 7.4844,
"step": 370
},
{
"epoch": 0.12632191311213148,
"grad_norm": 3.200800895690918,
"learning_rate": 6.316489361702127e-07,
"loss": 7.3059,
"step": 380
},
{
"epoch": 0.12964617398350337,
"grad_norm": 3.075329065322876,
"learning_rate": 6.482712765957447e-07,
"loss": 7.1618,
"step": 390
},
{
"epoch": 0.13297043485487522,
"grad_norm": 3.1853721141815186,
"learning_rate": 6.648936170212766e-07,
"loss": 7.1392,
"step": 400
},
{
"epoch": 0.1362946957262471,
"grad_norm": 3.0336828231811523,
"learning_rate": 6.815159574468085e-07,
"loss": 6.974,
"step": 410
},
{
"epoch": 0.139618956597619,
"grad_norm": 3.027355670928955,
"learning_rate": 6.981382978723404e-07,
"loss": 6.7714,
"step": 420
},
{
"epoch": 0.14294321746899089,
"grad_norm": 2.99857497215271,
"learning_rate": 7.147606382978723e-07,
"loss": 6.6538,
"step": 430
},
{
"epoch": 0.14626747834036277,
"grad_norm": 2.840437650680542,
"learning_rate": 7.313829787234043e-07,
"loss": 6.4758,
"step": 440
},
{
"epoch": 0.14959173921173463,
"grad_norm": 3.076049566268921,
"learning_rate": 7.480053191489362e-07,
"loss": 6.3225,
"step": 450
},
{
"epoch": 0.15291600008310652,
"grad_norm": 2.8588602542877197,
"learning_rate": 7.646276595744681e-07,
"loss": 6.1646,
"step": 460
},
{
"epoch": 0.1562402609544784,
"grad_norm": 2.9168858528137207,
"learning_rate": 7.8125e-07,
"loss": 6.0828,
"step": 470
},
{
"epoch": 0.1595645218258503,
"grad_norm": 2.795363187789917,
"learning_rate": 7.978723404255319e-07,
"loss": 5.9192,
"step": 480
},
{
"epoch": 0.16288878269722215,
"grad_norm": 2.3897600173950195,
"learning_rate": 8.144946808510639e-07,
"loss": 5.6675,
"step": 490
},
{
"epoch": 0.16621304356859404,
"grad_norm": 2.279939651489258,
"learning_rate": 8.311170212765957e-07,
"loss": 5.5443,
"step": 500
},
{
"epoch": 0.16953730443996592,
"grad_norm": 2.394994020462036,
"learning_rate": 8.477393617021276e-07,
"loss": 5.4127,
"step": 510
},
{
"epoch": 0.1728615653113378,
"grad_norm": 2.3148529529571533,
"learning_rate": 8.643617021276596e-07,
"loss": 5.2102,
"step": 520
},
{
"epoch": 0.1761858261827097,
"grad_norm": 2.053243637084961,
"learning_rate": 8.809840425531915e-07,
"loss": 5.1204,
"step": 530
},
{
"epoch": 0.17951008705408156,
"grad_norm": 2.0687060356140137,
"learning_rate": 8.976063829787235e-07,
"loss": 4.9786,
"step": 540
},
{
"epoch": 0.18283434792545344,
"grad_norm": 1.8042306900024414,
"learning_rate": 9.142287234042553e-07,
"loss": 4.7735,
"step": 550
},
{
"epoch": 0.18615860879682533,
"grad_norm": 1.8975441455841064,
"learning_rate": 9.308510638297872e-07,
"loss": 4.6749,
"step": 560
},
{
"epoch": 0.18948286966819722,
"grad_norm": 1.6640989780426025,
"learning_rate": 9.474734042553192e-07,
"loss": 4.529,
"step": 570
},
{
"epoch": 0.1928071305395691,
"grad_norm": 1.9563913345336914,
"learning_rate": 9.64095744680851e-07,
"loss": 4.4255,
"step": 580
},
{
"epoch": 0.19613139141094096,
"grad_norm": 1.433192253112793,
"learning_rate": 9.80718085106383e-07,
"loss": 4.3369,
"step": 590
},
{
"epoch": 0.19945565228231285,
"grad_norm": 1.6940258741378784,
"learning_rate": 9.97340425531915e-07,
"loss": 4.2264,
"step": 600
},
{
"epoch": 0.20277991315368474,
"grad_norm": 1.3721990585327148,
"learning_rate": 1.0139627659574467e-06,
"loss": 4.0771,
"step": 610
},
{
"epoch": 0.20610417402505662,
"grad_norm": 1.3481799364089966,
"learning_rate": 1.0305851063829786e-06,
"loss": 3.9652,
"step": 620
},
{
"epoch": 0.20942843489642848,
"grad_norm": 1.3010597229003906,
"learning_rate": 1.0472074468085108e-06,
"loss": 3.9205,
"step": 630
},
{
"epoch": 0.21275269576780037,
"grad_norm": 1.551216721534729,
"learning_rate": 1.0638297872340427e-06,
"loss": 3.8234,
"step": 640
},
{
"epoch": 0.21607695663917226,
"grad_norm": 1.3280216455459595,
"learning_rate": 1.0804521276595746e-06,
"loss": 3.6898,
"step": 650
},
{
"epoch": 0.21940121751054414,
"grad_norm": 1.0909334421157837,
"learning_rate": 1.0970744680851065e-06,
"loss": 3.6489,
"step": 660
},
{
"epoch": 0.22272547838191603,
"grad_norm": 1.345831036567688,
"learning_rate": 1.1136968085106384e-06,
"loss": 3.5296,
"step": 670
},
{
"epoch": 0.2260497392532879,
"grad_norm": 1.0882962942123413,
"learning_rate": 1.1303191489361703e-06,
"loss": 3.489,
"step": 680
},
{
"epoch": 0.22937400012465978,
"grad_norm": 0.9840554594993591,
"learning_rate": 1.1469414893617022e-06,
"loss": 3.4164,
"step": 690
},
{
"epoch": 0.23269826099603166,
"grad_norm": 1.0956693887710571,
"learning_rate": 1.163563829787234e-06,
"loss": 3.3211,
"step": 700
},
{
"epoch": 0.23602252186740355,
"grad_norm": 0.8875247240066528,
"learning_rate": 1.1801861702127662e-06,
"loss": 3.2647,
"step": 710
},
{
"epoch": 0.23934678273877544,
"grad_norm": 1.268930196762085,
"learning_rate": 1.1968085106382979e-06,
"loss": 3.2033,
"step": 720
},
{
"epoch": 0.2426710436101473,
"grad_norm": 0.9430990815162659,
"learning_rate": 1.2134308510638298e-06,
"loss": 3.1317,
"step": 730
},
{
"epoch": 0.24599530448151918,
"grad_norm": 0.9196615219116211,
"learning_rate": 1.2300531914893619e-06,
"loss": 3.0706,
"step": 740
},
{
"epoch": 0.24931956535289107,
"grad_norm": 0.7046869993209839,
"learning_rate": 1.2466755319148936e-06,
"loss": 3.0142,
"step": 750
},
{
"epoch": 0.25264382622426296,
"grad_norm": 0.9173153638839722,
"learning_rate": 1.2632978723404255e-06,
"loss": 2.949,
"step": 760
},
{
"epoch": 0.25596808709563484,
"grad_norm": 0.8014841675758362,
"learning_rate": 1.2799202127659576e-06,
"loss": 2.9325,
"step": 770
},
{
"epoch": 0.25929234796700673,
"grad_norm": 0.9520502686500549,
"learning_rate": 1.2965425531914895e-06,
"loss": 2.859,
"step": 780
},
{
"epoch": 0.2626166088383786,
"grad_norm": 0.7679387331008911,
"learning_rate": 1.3131648936170214e-06,
"loss": 2.8509,
"step": 790
},
{
"epoch": 0.26594086970975045,
"grad_norm": 0.7660825252532959,
"learning_rate": 1.3297872340425533e-06,
"loss": 2.7896,
"step": 800
},
{
"epoch": 0.26926513058112234,
"grad_norm": 0.7754834294319153,
"learning_rate": 1.3464095744680852e-06,
"loss": 2.736,
"step": 810
},
{
"epoch": 0.2725893914524942,
"grad_norm": 0.5802922248840332,
"learning_rate": 1.363031914893617e-06,
"loss": 2.6962,
"step": 820
},
{
"epoch": 0.2759136523238661,
"grad_norm": 0.6394158601760864,
"learning_rate": 1.379654255319149e-06,
"loss": 2.6656,
"step": 830
},
{
"epoch": 0.279237913195238,
"grad_norm": 0.6503139138221741,
"learning_rate": 1.3962765957446809e-06,
"loss": 2.616,
"step": 840
},
{
"epoch": 0.2825621740666099,
"grad_norm": 0.6165557503700256,
"learning_rate": 1.412898936170213e-06,
"loss": 2.5971,
"step": 850
},
{
"epoch": 0.28588643493798177,
"grad_norm": 0.6192012429237366,
"learning_rate": 1.4295212765957447e-06,
"loss": 2.5536,
"step": 860
},
{
"epoch": 0.28921069580935366,
"grad_norm": 0.6266525983810425,
"learning_rate": 1.4461436170212766e-06,
"loss": 2.5036,
"step": 870
},
{
"epoch": 0.29253495668072554,
"grad_norm": 0.5376760363578796,
"learning_rate": 1.4627659574468087e-06,
"loss": 2.5136,
"step": 880
},
{
"epoch": 0.2958592175520974,
"grad_norm": 0.6490041613578796,
"learning_rate": 1.4793882978723404e-06,
"loss": 2.4638,
"step": 890
},
{
"epoch": 0.29918347842346926,
"grad_norm": 0.6368073225021362,
"learning_rate": 1.4960106382978725e-06,
"loss": 2.4258,
"step": 900
},
{
"epoch": 0.30250773929484115,
"grad_norm": 0.5121726989746094,
"learning_rate": 1.5126329787234044e-06,
"loss": 2.4016,
"step": 910
},
{
"epoch": 0.30583200016621304,
"grad_norm": 0.5835744738578796,
"learning_rate": 1.5292553191489363e-06,
"loss": 2.4192,
"step": 920
},
{
"epoch": 0.3091562610375849,
"grad_norm": 0.5275241732597351,
"learning_rate": 1.5458776595744682e-06,
"loss": 2.3687,
"step": 930
},
{
"epoch": 0.3124805219089568,
"grad_norm": 0.4900510609149933,
"learning_rate": 1.5625e-06,
"loss": 2.3208,
"step": 940
},
{
"epoch": 0.3158047827803287,
"grad_norm": 0.4609052240848541,
"learning_rate": 1.5791223404255322e-06,
"loss": 2.3363,
"step": 950
},
{
"epoch": 0.3191290436517006,
"grad_norm": 0.461566299200058,
"learning_rate": 1.5957446808510639e-06,
"loss": 2.2793,
"step": 960
},
{
"epoch": 0.32245330452307247,
"grad_norm": 0.49795401096343994,
"learning_rate": 1.6123670212765958e-06,
"loss": 2.2845,
"step": 970
},
{
"epoch": 0.3257775653944443,
"grad_norm": 0.4422404170036316,
"learning_rate": 1.6289893617021279e-06,
"loss": 2.2744,
"step": 980
},
{
"epoch": 0.3291018262658162,
"grad_norm": 0.4161861538887024,
"learning_rate": 1.6456117021276596e-06,
"loss": 2.2463,
"step": 990
},
{
"epoch": 0.3324260871371881,
"grad_norm": 0.46071523427963257,
"learning_rate": 1.6622340425531915e-06,
"loss": 2.2271,
"step": 1000
},
{
"epoch": 0.33575034800855996,
"grad_norm": 0.3772067129611969,
"learning_rate": 1.6788563829787236e-06,
"loss": 2.2119,
"step": 1010
},
{
"epoch": 0.33907460887993185,
"grad_norm": 0.44782117009162903,
"learning_rate": 1.6954787234042553e-06,
"loss": 2.2022,
"step": 1020
},
{
"epoch": 0.34239886975130374,
"grad_norm": 0.4486360251903534,
"learning_rate": 1.7121010638297872e-06,
"loss": 2.1723,
"step": 1030
},
{
"epoch": 0.3457231306226756,
"grad_norm": 0.47423475980758667,
"learning_rate": 1.7287234042553193e-06,
"loss": 2.1295,
"step": 1040
},
{
"epoch": 0.3490473914940475,
"grad_norm": 0.4199342131614685,
"learning_rate": 1.745345744680851e-06,
"loss": 2.1387,
"step": 1050
},
{
"epoch": 0.3523716523654194,
"grad_norm": 0.43744415044784546,
"learning_rate": 1.761968085106383e-06,
"loss": 2.1195,
"step": 1060
},
{
"epoch": 0.3556959132367913,
"grad_norm": 0.3780044913291931,
"learning_rate": 1.778590425531915e-06,
"loss": 2.1194,
"step": 1070
},
{
"epoch": 0.3590201741081631,
"grad_norm": 0.40349099040031433,
"learning_rate": 1.795212765957447e-06,
"loss": 2.1005,
"step": 1080
},
{
"epoch": 0.362344434979535,
"grad_norm": 0.378764271736145,
"learning_rate": 1.8118351063829788e-06,
"loss": 2.0757,
"step": 1090
},
{
"epoch": 0.3656686958509069,
"grad_norm": 0.34115588665008545,
"learning_rate": 1.8284574468085107e-06,
"loss": 2.0591,
"step": 1100
},
{
"epoch": 0.3689929567222788,
"grad_norm": 0.39553964138031006,
"learning_rate": 1.8450797872340428e-06,
"loss": 2.0298,
"step": 1110
},
{
"epoch": 0.37231721759365066,
"grad_norm": 0.36110466718673706,
"learning_rate": 1.8617021276595745e-06,
"loss": 2.0113,
"step": 1120
},
{
"epoch": 0.37564147846502255,
"grad_norm": 0.33477863669395447,
"learning_rate": 1.8783244680851066e-06,
"loss": 2.0197,
"step": 1130
},
{
"epoch": 0.37896573933639444,
"grad_norm": 0.43919846415519714,
"learning_rate": 1.8949468085106385e-06,
"loss": 1.9794,
"step": 1140
},
{
"epoch": 0.3822900002077663,
"grad_norm": 0.3243393898010254,
"learning_rate": 1.9115691489361704e-06,
"loss": 1.9667,
"step": 1150
},
{
"epoch": 0.3856142610791382,
"grad_norm": 0.3350262939929962,
"learning_rate": 1.928191489361702e-06,
"loss": 1.978,
"step": 1160
},
{
"epoch": 0.38893852195051004,
"grad_norm": 0.3365063965320587,
"learning_rate": 1.944813829787234e-06,
"loss": 1.9701,
"step": 1170
},
{
"epoch": 0.39226278282188193,
"grad_norm": 0.3240489661693573,
"learning_rate": 1.961436170212766e-06,
"loss": 1.9465,
"step": 1180
},
{
"epoch": 0.3955870436932538,
"grad_norm": 0.3239437937736511,
"learning_rate": 1.978058510638298e-06,
"loss": 1.9253,
"step": 1190
},
{
"epoch": 0.3989113045646257,
"grad_norm": 0.3397749364376068,
"learning_rate": 1.99468085106383e-06,
"loss": 1.9057,
"step": 1200
},
{
"epoch": 0.4022355654359976,
"grad_norm": 0.2915981113910675,
"learning_rate": 2.011303191489362e-06,
"loss": 1.9047,
"step": 1210
},
{
"epoch": 0.4055598263073695,
"grad_norm": 0.39456045627593994,
"learning_rate": 2.0279255319148935e-06,
"loss": 1.9144,
"step": 1220
},
{
"epoch": 0.40888408717874136,
"grad_norm": 0.2593387961387634,
"learning_rate": 2.0445478723404256e-06,
"loss": 1.8969,
"step": 1230
},
{
"epoch": 0.41220834805011325,
"grad_norm": 0.30935177206993103,
"learning_rate": 2.0611702127659573e-06,
"loss": 1.8931,
"step": 1240
},
{
"epoch": 0.41553260892148514,
"grad_norm": 0.27917250990867615,
"learning_rate": 2.0777925531914894e-06,
"loss": 1.8899,
"step": 1250
},
{
"epoch": 0.41885686979285697,
"grad_norm": 0.25976502895355225,
"learning_rate": 2.0944148936170215e-06,
"loss": 1.8503,
"step": 1260
},
{
"epoch": 0.42218113066422885,
"grad_norm": 0.31833794713020325,
"learning_rate": 2.111037234042553e-06,
"loss": 1.8527,
"step": 1270
},
{
"epoch": 0.42550539153560074,
"grad_norm": 0.2671976685523987,
"learning_rate": 2.1276595744680853e-06,
"loss": 1.8505,
"step": 1280
},
{
"epoch": 0.42882965240697263,
"grad_norm": 0.3245258629322052,
"learning_rate": 2.144281914893617e-06,
"loss": 1.864,
"step": 1290
},
{
"epoch": 0.4321539132783445,
"grad_norm": 0.2622531056404114,
"learning_rate": 2.160904255319149e-06,
"loss": 1.8301,
"step": 1300
},
{
"epoch": 0.4354781741497164,
"grad_norm": 0.3247709274291992,
"learning_rate": 2.177526595744681e-06,
"loss": 1.812,
"step": 1310
},
{
"epoch": 0.4388024350210883,
"grad_norm": 0.26424384117126465,
"learning_rate": 2.194148936170213e-06,
"loss": 1.7958,
"step": 1320
},
{
"epoch": 0.4421266958924602,
"grad_norm": 0.2569092810153961,
"learning_rate": 2.210771276595745e-06,
"loss": 1.8147,
"step": 1330
},
{
"epoch": 0.44545095676383206,
"grad_norm": 0.2393629103899002,
"learning_rate": 2.2273936170212767e-06,
"loss": 1.7976,
"step": 1340
},
{
"epoch": 0.44877521763520395,
"grad_norm": 0.232402965426445,
"learning_rate": 2.244015957446809e-06,
"loss": 1.7597,
"step": 1350
},
{
"epoch": 0.4520994785065758,
"grad_norm": 0.26385971903800964,
"learning_rate": 2.2606382978723405e-06,
"loss": 1.7781,
"step": 1360
},
{
"epoch": 0.45542373937794767,
"grad_norm": 0.2671038806438446,
"learning_rate": 2.277260638297872e-06,
"loss": 1.7583,
"step": 1370
},
{
"epoch": 0.45874800024931955,
"grad_norm": 0.27096447348594666,
"learning_rate": 2.2938829787234043e-06,
"loss": 1.7402,
"step": 1380
},
{
"epoch": 0.46207226112069144,
"grad_norm": 0.2245018631219864,
"learning_rate": 2.3105053191489364e-06,
"loss": 1.7644,
"step": 1390
},
{
"epoch": 0.46539652199206333,
"grad_norm": 0.20663714408874512,
"learning_rate": 2.327127659574468e-06,
"loss": 1.7519,
"step": 1400
},
{
"epoch": 0.4687207828634352,
"grad_norm": 0.26273128390312195,
"learning_rate": 2.3437500000000002e-06,
"loss": 1.7312,
"step": 1410
},
{
"epoch": 0.4720450437348071,
"grad_norm": 0.24725256860256195,
"learning_rate": 2.3603723404255323e-06,
"loss": 1.7217,
"step": 1420
},
{
"epoch": 0.475369304606179,
"grad_norm": 0.25341796875,
"learning_rate": 2.376994680851064e-06,
"loss": 1.7246,
"step": 1430
},
{
"epoch": 0.4786935654775509,
"grad_norm": 0.21035414934158325,
"learning_rate": 2.3936170212765957e-06,
"loss": 1.7017,
"step": 1440
},
{
"epoch": 0.4820178263489227,
"grad_norm": 0.21454143524169922,
"learning_rate": 2.410239361702128e-06,
"loss": 1.7049,
"step": 1450
},
{
"epoch": 0.4853420872202946,
"grad_norm": 0.22413010895252228,
"learning_rate": 2.4268617021276595e-06,
"loss": 1.6809,
"step": 1460
},
{
"epoch": 0.4886663480916665,
"grad_norm": 0.2039473056793213,
"learning_rate": 2.4434840425531916e-06,
"loss": 1.6873,
"step": 1470
},
{
"epoch": 0.49199060896303837,
"grad_norm": 0.18895457684993744,
"learning_rate": 2.4601063829787237e-06,
"loss": 1.69,
"step": 1480
},
{
"epoch": 0.49531486983441025,
"grad_norm": 0.21047964692115784,
"learning_rate": 2.4767287234042554e-06,
"loss": 1.681,
"step": 1490
},
{
"epoch": 0.49863913070578214,
"grad_norm": 0.2226460874080658,
"learning_rate": 2.493351063829787e-06,
"loss": 1.6613,
"step": 1500
},
{
"epoch": 0.501963391577154,
"grad_norm": 0.21892835199832916,
"learning_rate": 2.5099734042553192e-06,
"loss": 1.6376,
"step": 1510
},
{
"epoch": 0.5052876524485259,
"grad_norm": 0.20363831520080566,
"learning_rate": 2.526595744680851e-06,
"loss": 1.6541,
"step": 1520
},
{
"epoch": 0.5086119133198977,
"grad_norm": 0.1988699585199356,
"learning_rate": 2.543218085106383e-06,
"loss": 1.6422,
"step": 1530
},
{
"epoch": 0.5119361741912697,
"grad_norm": 0.2050096094608307,
"learning_rate": 2.559840425531915e-06,
"loss": 1.6377,
"step": 1540
},
{
"epoch": 0.5152604350626415,
"grad_norm": 0.23265878856182098,
"learning_rate": 2.5764627659574472e-06,
"loss": 1.6251,
"step": 1550
},
{
"epoch": 0.5185846959340135,
"grad_norm": 0.2024969905614853,
"learning_rate": 2.593085106382979e-06,
"loss": 1.6048,
"step": 1560
},
{
"epoch": 0.5219089568053853,
"grad_norm": 0.21343863010406494,
"learning_rate": 2.6097074468085106e-06,
"loss": 1.6195,
"step": 1570
},
{
"epoch": 0.5252332176767572,
"grad_norm": 0.1862565129995346,
"learning_rate": 2.6263297872340427e-06,
"loss": 1.5991,
"step": 1580
},
{
"epoch": 0.5285574785481291,
"grad_norm": 0.22765249013900757,
"learning_rate": 2.6429521276595744e-06,
"loss": 1.5957,
"step": 1590
},
{
"epoch": 0.5318817394195009,
"grad_norm": 0.19874997437000275,
"learning_rate": 2.6595744680851065e-06,
"loss": 1.5847,
"step": 1600
},
{
"epoch": 0.5352060002908728,
"grad_norm": 0.25979486107826233,
"learning_rate": 2.6761968085106386e-06,
"loss": 1.6046,
"step": 1610
},
{
"epoch": 0.5385302611622447,
"grad_norm": 0.1831529289484024,
"learning_rate": 2.6928191489361703e-06,
"loss": 1.5835,
"step": 1620
},
{
"epoch": 0.5418545220336166,
"grad_norm": 0.2680751085281372,
"learning_rate": 2.7094414893617024e-06,
"loss": 1.6009,
"step": 1630
},
{
"epoch": 0.5451787829049884,
"grad_norm": 0.18160907924175262,
"learning_rate": 2.726063829787234e-06,
"loss": 1.5666,
"step": 1640
},
{
"epoch": 0.5485030437763604,
"grad_norm": 0.22875571250915527,
"learning_rate": 2.742686170212766e-06,
"loss": 1.5614,
"step": 1650
},
{
"epoch": 0.5518273046477322,
"grad_norm": 0.21110033988952637,
"learning_rate": 2.759308510638298e-06,
"loss": 1.5707,
"step": 1660
},
{
"epoch": 0.5551515655191042,
"grad_norm": 0.1887374073266983,
"learning_rate": 2.77593085106383e-06,
"loss": 1.5781,
"step": 1670
},
{
"epoch": 0.558475826390476,
"grad_norm": 0.1916954219341278,
"learning_rate": 2.7925531914893617e-06,
"loss": 1.563,
"step": 1680
},
{
"epoch": 0.5618000872618478,
"grad_norm": 0.21001753211021423,
"learning_rate": 2.809175531914894e-06,
"loss": 1.5495,
"step": 1690
},
{
"epoch": 0.5651243481332198,
"grad_norm": 0.1702377200126648,
"learning_rate": 2.825797872340426e-06,
"loss": 1.5427,
"step": 1700
},
{
"epoch": 0.5684486090045916,
"grad_norm": 0.19061295688152313,
"learning_rate": 2.8424202127659576e-06,
"loss": 1.5387,
"step": 1710
},
{
"epoch": 0.5717728698759635,
"grad_norm": 0.17503058910369873,
"learning_rate": 2.8590425531914893e-06,
"loss": 1.5154,
"step": 1720
},
{
"epoch": 0.5750971307473354,
"grad_norm": 0.1703094244003296,
"learning_rate": 2.8756648936170214e-06,
"loss": 1.5209,
"step": 1730
},
{
"epoch": 0.5784213916187073,
"grad_norm": 0.22713126242160797,
"learning_rate": 2.892287234042553e-06,
"loss": 1.529,
"step": 1740
},
{
"epoch": 0.5817456524900791,
"grad_norm": 0.16218431293964386,
"learning_rate": 2.9089095744680852e-06,
"loss": 1.505,
"step": 1750
},
{
"epoch": 0.5850699133614511,
"grad_norm": 0.16082778573036194,
"learning_rate": 2.9255319148936174e-06,
"loss": 1.5312,
"step": 1760
},
{
"epoch": 0.5883941742328229,
"grad_norm": 0.19500340521335602,
"learning_rate": 2.942154255319149e-06,
"loss": 1.4971,
"step": 1770
},
{
"epoch": 0.5917184351041948,
"grad_norm": 0.16831324994564056,
"learning_rate": 2.9587765957446807e-06,
"loss": 1.5172,
"step": 1780
},
{
"epoch": 0.5950426959755667,
"grad_norm": 0.17963413894176483,
"learning_rate": 2.975398936170213e-06,
"loss": 1.5076,
"step": 1790
},
{
"epoch": 0.5983669568469385,
"grad_norm": 0.17123515903949738,
"learning_rate": 2.992021276595745e-06,
"loss": 1.4941,
"step": 1800
},
{
"epoch": 0.6016912177183105,
"grad_norm": 0.15727902948856354,
"learning_rate": 3.0086436170212766e-06,
"loss": 1.4609,
"step": 1810
},
{
"epoch": 0.6050154785896823,
"grad_norm": 0.1833077073097229,
"learning_rate": 3.0252659574468088e-06,
"loss": 1.5042,
"step": 1820
},
{
"epoch": 0.6083397394610542,
"grad_norm": 0.16962528228759766,
"learning_rate": 3.041888297872341e-06,
"loss": 1.4651,
"step": 1830
},
{
"epoch": 0.6116640003324261,
"grad_norm": 0.17829731106758118,
"learning_rate": 3.0585106382978726e-06,
"loss": 1.4907,
"step": 1840
},
{
"epoch": 0.614988261203798,
"grad_norm": 0.16981306672096252,
"learning_rate": 3.0751329787234042e-06,
"loss": 1.4683,
"step": 1850
},
{
"epoch": 0.6183125220751698,
"grad_norm": 0.20783671736717224,
"learning_rate": 3.0917553191489363e-06,
"loss": 1.463,
"step": 1860
},
{
"epoch": 0.6216367829465417,
"grad_norm": 0.20343361794948578,
"learning_rate": 3.108377659574468e-06,
"loss": 1.4632,
"step": 1870
},
{
"epoch": 0.6249610438179136,
"grad_norm": 0.18592675030231476,
"learning_rate": 3.125e-06,
"loss": 1.4887,
"step": 1880
},
{
"epoch": 0.6282853046892855,
"grad_norm": 0.17272701859474182,
"learning_rate": 3.141622340425532e-06,
"loss": 1.4491,
"step": 1890
},
{
"epoch": 0.6316095655606574,
"grad_norm": 0.2021792083978653,
"learning_rate": 3.1582446808510644e-06,
"loss": 1.4537,
"step": 1900
},
{
"epoch": 0.6349338264320292,
"grad_norm": 0.16319766640663147,
"learning_rate": 3.174867021276596e-06,
"loss": 1.456,
"step": 1910
},
{
"epoch": 0.6382580873034012,
"grad_norm": 0.2344328761100769,
"learning_rate": 3.1914893617021277e-06,
"loss": 1.4801,
"step": 1920
},
{
"epoch": 0.641582348174773,
"grad_norm": 0.17495407164096832,
"learning_rate": 3.20811170212766e-06,
"loss": 1.4435,
"step": 1930
},
{
"epoch": 0.6449066090461449,
"grad_norm": 0.19222399592399597,
"learning_rate": 3.2247340425531915e-06,
"loss": 1.4391,
"step": 1940
},
{
"epoch": 0.6482308699175168,
"grad_norm": 0.24526530504226685,
"learning_rate": 3.2413563829787232e-06,
"loss": 1.4555,
"step": 1950
},
{
"epoch": 0.6515551307888886,
"grad_norm": 0.18150673806667328,
"learning_rate": 3.2579787234042558e-06,
"loss": 1.4396,
"step": 1960
},
{
"epoch": 0.6548793916602605,
"grad_norm": 0.18334811925888062,
"learning_rate": 3.2746010638297875e-06,
"loss": 1.4139,
"step": 1970
},
{
"epoch": 0.6582036525316324,
"grad_norm": 0.25186312198638916,
"learning_rate": 3.291223404255319e-06,
"loss": 1.439,
"step": 1980
},
{
"epoch": 0.6615279134030043,
"grad_norm": 0.16558600962162018,
"learning_rate": 3.3078457446808513e-06,
"loss": 1.4383,
"step": 1990
},
{
"epoch": 0.6648521742743762,
"grad_norm": 0.2373538315296173,
"learning_rate": 3.324468085106383e-06,
"loss": 1.4334,
"step": 2000
},
{
"epoch": 0.6681764351457481,
"grad_norm": 0.2821474075317383,
"learning_rate": 3.3410904255319146e-06,
"loss": 1.4418,
"step": 2010
},
{
"epoch": 0.6715006960171199,
"grad_norm": 0.2443741410970688,
"learning_rate": 3.357712765957447e-06,
"loss": 1.4071,
"step": 2020
},
{
"epoch": 0.6748249568884919,
"grad_norm": 0.17468735575675964,
"learning_rate": 3.374335106382979e-06,
"loss": 1.4109,
"step": 2030
},
{
"epoch": 0.6781492177598637,
"grad_norm": 0.1655045598745346,
"learning_rate": 3.3909574468085105e-06,
"loss": 1.4049,
"step": 2040
},
{
"epoch": 0.6814734786312356,
"grad_norm": 0.17598801851272583,
"learning_rate": 3.4075797872340427e-06,
"loss": 1.4188,
"step": 2050
},
{
"epoch": 0.6847977395026075,
"grad_norm": 0.28528669476509094,
"learning_rate": 3.4242021276595743e-06,
"loss": 1.408,
"step": 2060
},
{
"epoch": 0.6881220003739793,
"grad_norm": 0.17654620110988617,
"learning_rate": 3.440824468085106e-06,
"loss": 1.4117,
"step": 2070
},
{
"epoch": 0.6914462612453512,
"grad_norm": 0.2636467516422272,
"learning_rate": 3.4574468085106386e-06,
"loss": 1.3947,
"step": 2080
},
{
"epoch": 0.6947705221167231,
"grad_norm": 0.26495933532714844,
"learning_rate": 3.4740691489361703e-06,
"loss": 1.398,
"step": 2090
},
{
"epoch": 0.698094782988095,
"grad_norm": 0.3873574435710907,
"learning_rate": 3.490691489361702e-06,
"loss": 1.4204,
"step": 2100
},
{
"epoch": 0.7014190438594669,
"grad_norm": 0.327854186296463,
"learning_rate": 3.5073138297872345e-06,
"loss": 1.3744,
"step": 2110
},
{
"epoch": 0.7047433047308388,
"grad_norm": 0.308570921421051,
"learning_rate": 3.523936170212766e-06,
"loss": 1.4293,
"step": 2120
},
{
"epoch": 0.7080675656022106,
"grad_norm": 0.21123336255550385,
"learning_rate": 3.5405585106382983e-06,
"loss": 1.3878,
"step": 2130
},
{
"epoch": 0.7113918264735826,
"grad_norm": 0.18777534365653992,
"learning_rate": 3.55718085106383e-06,
"loss": 1.3882,
"step": 2140
},
{
"epoch": 0.7147160873449544,
"grad_norm": 0.2535350024700165,
"learning_rate": 3.5738031914893617e-06,
"loss": 1.3974,
"step": 2150
},
{
"epoch": 0.7180403482163262,
"grad_norm": 0.15405435860157013,
"learning_rate": 3.590425531914894e-06,
"loss": 1.3853,
"step": 2160
},
{
"epoch": 0.7213646090876982,
"grad_norm": 0.1863648146390915,
"learning_rate": 3.607047872340426e-06,
"loss": 1.3835,
"step": 2170
},
{
"epoch": 0.72468886995907,
"grad_norm": 0.18587157130241394,
"learning_rate": 3.6236702127659576e-06,
"loss": 1.3711,
"step": 2180
},
{
"epoch": 0.728013130830442,
"grad_norm": 0.18254730105400085,
"learning_rate": 3.6402925531914897e-06,
"loss": 1.3768,
"step": 2190
},
{
"epoch": 0.7313373917018138,
"grad_norm": 0.21665969491004944,
"learning_rate": 3.6569148936170214e-06,
"loss": 1.3638,
"step": 2200
},
{
"epoch": 0.7346616525731857,
"grad_norm": 0.15701924264431,
"learning_rate": 3.673537234042553e-06,
"loss": 1.3885,
"step": 2210
},
{
"epoch": 0.7379859134445576,
"grad_norm": 0.19307725131511688,
"learning_rate": 3.6901595744680856e-06,
"loss": 1.3933,
"step": 2220
},
{
"epoch": 0.7413101743159295,
"grad_norm": 0.16837100684642792,
"learning_rate": 3.7067819148936173e-06,
"loss": 1.3685,
"step": 2230
},
{
"epoch": 0.7446344351873013,
"grad_norm": 0.2914402484893799,
"learning_rate": 3.723404255319149e-06,
"loss": 1.3802,
"step": 2240
},
{
"epoch": 0.7479586960586732,
"grad_norm": 0.2770545184612274,
"learning_rate": 3.7400265957446815e-06,
"loss": 1.3575,
"step": 2250
},
{
"epoch": 0.7512829569300451,
"grad_norm": 0.19819234311580658,
"learning_rate": 3.756648936170213e-06,
"loss": 1.3695,
"step": 2260
},
{
"epoch": 0.7546072178014169,
"grad_norm": 0.15371359884738922,
"learning_rate": 3.7732712765957445e-06,
"loss": 1.3514,
"step": 2270
},
{
"epoch": 0.7579314786727889,
"grad_norm": 0.26700448989868164,
"learning_rate": 3.789893617021277e-06,
"loss": 1.3689,
"step": 2280
},
{
"epoch": 0.7612557395441607,
"grad_norm": 0.2938506007194519,
"learning_rate": 3.8065159574468087e-06,
"loss": 1.3518,
"step": 2290
},
{
"epoch": 0.7645800004155326,
"grad_norm": 0.2514606714248657,
"learning_rate": 3.823138297872341e-06,
"loss": 1.3655,
"step": 2300
},
{
"epoch": 0.7679042612869045,
"grad_norm": 0.2503184378147125,
"learning_rate": 3.8397606382978725e-06,
"loss": 1.3511,
"step": 2310
},
{
"epoch": 0.7712285221582764,
"grad_norm": 0.1815042346715927,
"learning_rate": 3.856382978723404e-06,
"loss": 1.383,
"step": 2320
},
{
"epoch": 0.7745527830296483,
"grad_norm": 0.25425419211387634,
"learning_rate": 3.873005319148936e-06,
"loss": 1.3354,
"step": 2330
},
{
"epoch": 0.7778770439010201,
"grad_norm": 0.18466657400131226,
"learning_rate": 3.889627659574468e-06,
"loss": 1.3514,
"step": 2340
},
{
"epoch": 0.781201304772392,
"grad_norm": 0.1782332807779312,
"learning_rate": 3.90625e-06,
"loss": 1.32,
"step": 2350
},
{
"epoch": 0.7845255656437639,
"grad_norm": 0.27637991309165955,
"learning_rate": 3.922872340425532e-06,
"loss": 1.3383,
"step": 2360
},
{
"epoch": 0.7878498265151358,
"grad_norm": 0.17314772307872772,
"learning_rate": 3.939494680851064e-06,
"loss": 1.3314,
"step": 2370
},
{
"epoch": 0.7911740873865076,
"grad_norm": 0.3641667068004608,
"learning_rate": 3.956117021276596e-06,
"loss": 1.3543,
"step": 2380
},
{
"epoch": 0.7944983482578796,
"grad_norm": 0.3088253438472748,
"learning_rate": 3.972739361702128e-06,
"loss": 1.3444,
"step": 2390
},
{
"epoch": 0.7978226091292514,
"grad_norm": 0.25276973843574524,
"learning_rate": 3.98936170212766e-06,
"loss": 1.3102,
"step": 2400
},
{
"epoch": 0.8011468700006233,
"grad_norm": 0.26414382457733154,
"learning_rate": 4.005984042553192e-06,
"loss": 1.3119,
"step": 2410
},
{
"epoch": 0.8044711308719952,
"grad_norm": 0.1684638261795044,
"learning_rate": 4.022606382978724e-06,
"loss": 1.3204,
"step": 2420
},
{
"epoch": 0.807795391743367,
"grad_norm": 0.18500946462154388,
"learning_rate": 4.039228723404256e-06,
"loss": 1.3251,
"step": 2430
},
{
"epoch": 0.811119652614739,
"grad_norm": 0.2754835784435272,
"learning_rate": 4.055851063829787e-06,
"loss": 1.3258,
"step": 2440
},
{
"epoch": 0.8144439134861108,
"grad_norm": 0.18949855864048004,
"learning_rate": 4.072473404255319e-06,
"loss": 1.3145,
"step": 2450
},
{
"epoch": 0.8177681743574827,
"grad_norm": 0.6927218437194824,
"learning_rate": 4.089095744680851e-06,
"loss": 1.3205,
"step": 2460
},
{
"epoch": 0.8210924352288546,
"grad_norm": 0.36098670959472656,
"learning_rate": 4.105718085106383e-06,
"loss": 1.3295,
"step": 2470
},
{
"epoch": 0.8244166961002265,
"grad_norm": 0.25839686393737793,
"learning_rate": 4.1223404255319146e-06,
"loss": 1.321,
"step": 2480
},
{
"epoch": 0.8277409569715983,
"grad_norm": 0.18720127642154694,
"learning_rate": 4.138962765957447e-06,
"loss": 1.2975,
"step": 2490
},
{
"epoch": 0.8310652178429703,
"grad_norm": 0.17975495755672455,
"learning_rate": 4.155585106382979e-06,
"loss": 1.318,
"step": 2500
}
],
"logging_steps": 10,
"max_steps": 150400,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4430265344e+18,
"train_batch_size": 5,
"trial_name": null,
"trial_params": null
}
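
A minimal sketch for inspecting this checkpoint's log_history, assuming a local copy of the file saved as trainer_state.json (the path is illustrative, not part of the upload):

import json

# Load the exported trainer state.
with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry records epoch, grad_norm, learning_rate, loss, and step.
points = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in points]
losses = [e["loss"] for e in points]

print(f"logged points: {len(points)}")
print(f"first loss: {losses[0]:.4f} at step {steps[0]}")
print(f"last loss:  {losses[-1]:.4f} at step {steps[-1]}")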