{ "best_metric": 0.0710952952504158, "best_model_checkpoint": "./vit-base-trash-demo-v5/checkpoint-4000", "epoch": 4.0, "eval_steps": 1000, "global_step": 4476, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008936550491510277, "grad_norm": 2.2266733646392822, "learning_rate": 0.00019955317247542448, "loss": 1.7283, "step": 10 }, { "epoch": 0.017873100983020553, "grad_norm": 1.7498841285705566, "learning_rate": 0.000199106344950849, "loss": 1.2864, "step": 20 }, { "epoch": 0.02680965147453083, "grad_norm": 2.307732582092285, "learning_rate": 0.00019865951742627347, "loss": 0.9102, "step": 30 }, { "epoch": 0.035746201966041107, "grad_norm": 4.274341106414795, "learning_rate": 0.00019821268990169794, "loss": 0.7496, "step": 40 }, { "epoch": 0.044682752457551385, "grad_norm": 2.586291551589966, "learning_rate": 0.00019776586237712246, "loss": 0.7103, "step": 50 }, { "epoch": 0.05361930294906166, "grad_norm": 1.1641569137573242, "learning_rate": 0.00019731903485254693, "loss": 0.5292, "step": 60 }, { "epoch": 0.06255585344057193, "grad_norm": 5.71007776260376, "learning_rate": 0.0001968722073279714, "loss": 0.5169, "step": 70 }, { "epoch": 0.07149240393208221, "grad_norm": 4.907001495361328, "learning_rate": 0.0001964253798033959, "loss": 0.5468, "step": 80 }, { "epoch": 0.08042895442359249, "grad_norm": 4.788897514343262, "learning_rate": 0.00019597855227882039, "loss": 0.5174, "step": 90 }, { "epoch": 0.08936550491510277, "grad_norm": 3.1065032482147217, "learning_rate": 0.00019553172475424485, "loss": 0.4598, "step": 100 }, { "epoch": 0.09830205540661305, "grad_norm": 2.299734115600586, "learning_rate": 0.00019508489722966935, "loss": 0.3711, "step": 110 }, { "epoch": 0.10723860589812333, "grad_norm": 1.3453820943832397, "learning_rate": 0.00019463806970509384, "loss": 0.4316, "step": 120 }, { "epoch": 0.1161751563896336, "grad_norm": 2.797497034072876, "learning_rate": 0.00019419124218051834, "loss": 0.5008, "step": 130 }, { "epoch": 0.12511170688114387, "grad_norm": 3.164219856262207, "learning_rate": 0.0001937444146559428, "loss": 0.371, "step": 140 }, { "epoch": 0.13404825737265416, "grad_norm": 5.14756965637207, "learning_rate": 0.0001932975871313673, "loss": 0.5617, "step": 150 }, { "epoch": 0.14298480786416443, "grad_norm": 1.478502869606018, "learning_rate": 0.0001928507596067918, "loss": 0.3317, "step": 160 }, { "epoch": 0.15192135835567472, "grad_norm": 1.3914997577667236, "learning_rate": 0.00019240393208221627, "loss": 0.3637, "step": 170 }, { "epoch": 0.16085790884718498, "grad_norm": 2.963843822479248, "learning_rate": 0.00019195710455764076, "loss": 0.3628, "step": 180 }, { "epoch": 0.16979445933869527, "grad_norm": 4.369801044464111, "learning_rate": 0.00019151027703306526, "loss": 0.4182, "step": 190 }, { "epoch": 0.17873100983020554, "grad_norm": 2.109142541885376, "learning_rate": 0.00019106344950848973, "loss": 0.279, "step": 200 }, { "epoch": 0.1876675603217158, "grad_norm": 5.093620777130127, "learning_rate": 0.00019061662198391422, "loss": 0.4936, "step": 210 }, { "epoch": 0.1966041108132261, "grad_norm": 2.2940657138824463, "learning_rate": 0.00019016979445933872, "loss": 0.532, "step": 220 }, { "epoch": 0.20554066130473636, "grad_norm": 3.415463924407959, "learning_rate": 0.00018972296693476319, "loss": 0.3475, "step": 230 }, { "epoch": 0.21447721179624665, "grad_norm": 1.158109426498413, "learning_rate": 0.00018927613941018768, "loss": 0.386, "step": 240 }, { "epoch": 0.22341376228775692, "grad_norm": 0.44666406512260437, "learning_rate": 0.00018882931188561218, "loss": 0.4165, "step": 250 }, { "epoch": 0.2323503127792672, "grad_norm": 2.626112461090088, "learning_rate": 0.00018838248436103664, "loss": 0.3478, "step": 260 }, { "epoch": 0.24128686327077747, "grad_norm": 3.9105656147003174, "learning_rate": 0.00018793565683646114, "loss": 0.293, "step": 270 }, { "epoch": 0.25022341376228774, "grad_norm": 6.075140953063965, "learning_rate": 0.00018748882931188563, "loss": 0.4013, "step": 280 }, { "epoch": 0.25915996425379806, "grad_norm": 1.2540974617004395, "learning_rate": 0.0001870420017873101, "loss": 0.5419, "step": 290 }, { "epoch": 0.2680965147453083, "grad_norm": 2.7805850505828857, "learning_rate": 0.00018659517426273457, "loss": 0.373, "step": 300 }, { "epoch": 0.2770330652368186, "grad_norm": 1.9444429874420166, "learning_rate": 0.0001861483467381591, "loss": 0.2447, "step": 310 }, { "epoch": 0.28596961572832885, "grad_norm": 2.164452075958252, "learning_rate": 0.00018570151921358356, "loss": 0.4141, "step": 320 }, { "epoch": 0.2949061662198391, "grad_norm": 4.439435958862305, "learning_rate": 0.00018525469168900803, "loss": 0.3183, "step": 330 }, { "epoch": 0.30384271671134944, "grad_norm": 3.332730770111084, "learning_rate": 0.00018480786416443255, "loss": 0.3255, "step": 340 }, { "epoch": 0.3127792672028597, "grad_norm": 4.461299419403076, "learning_rate": 0.00018436103663985702, "loss": 0.2733, "step": 350 }, { "epoch": 0.32171581769436997, "grad_norm": 4.637039661407471, "learning_rate": 0.0001839142091152815, "loss": 0.2486, "step": 360 }, { "epoch": 0.33065236818588023, "grad_norm": 1.985630989074707, "learning_rate": 0.000183467381590706, "loss": 0.2978, "step": 370 }, { "epoch": 0.33958891867739055, "grad_norm": 2.7530932426452637, "learning_rate": 0.00018302055406613048, "loss": 0.3722, "step": 380 }, { "epoch": 0.3485254691689008, "grad_norm": 0.20281237363815308, "learning_rate": 0.00018257372654155497, "loss": 0.3897, "step": 390 }, { "epoch": 0.3574620196604111, "grad_norm": 1.9220471382141113, "learning_rate": 0.00018212689901697947, "loss": 0.2564, "step": 400 }, { "epoch": 0.36639857015192134, "grad_norm": 4.841084957122803, "learning_rate": 0.00018168007149240394, "loss": 0.3892, "step": 410 }, { "epoch": 0.3753351206434316, "grad_norm": 5.499583721160889, "learning_rate": 0.00018123324396782843, "loss": 0.3462, "step": 420 }, { "epoch": 0.38427167113494193, "grad_norm": 1.4320725202560425, "learning_rate": 0.0001807864164432529, "loss": 0.2658, "step": 430 }, { "epoch": 0.3932082216264522, "grad_norm": 2.9739625453948975, "learning_rate": 0.0001803395889186774, "loss": 0.269, "step": 440 }, { "epoch": 0.40214477211796246, "grad_norm": 2.0247411727905273, "learning_rate": 0.0001798927613941019, "loss": 0.3355, "step": 450 }, { "epoch": 0.4110813226094727, "grad_norm": 0.25784507393836975, "learning_rate": 0.00017944593386952636, "loss": 0.2589, "step": 460 }, { "epoch": 0.42001787310098304, "grad_norm": 8.2475004196167, "learning_rate": 0.00017899910634495086, "loss": 0.3034, "step": 470 }, { "epoch": 0.4289544235924933, "grad_norm": 2.1054959297180176, "learning_rate": 0.00017855227882037535, "loss": 0.3784, "step": 480 }, { "epoch": 0.43789097408400357, "grad_norm": 2.6620118618011475, "learning_rate": 0.00017810545129579982, "loss": 0.3303, "step": 490 }, { "epoch": 0.44682752457551383, "grad_norm": 2.4308674335479736, "learning_rate": 0.00017765862377122431, "loss": 0.4428, "step": 500 }, { "epoch": 0.45576407506702415, "grad_norm": 4.620336532592773, "learning_rate": 0.0001772117962466488, "loss": 0.4007, "step": 510 }, { "epoch": 0.4647006255585344, "grad_norm": 0.6208035945892334, "learning_rate": 0.00017676496872207328, "loss": 0.4641, "step": 520 }, { "epoch": 0.4736371760500447, "grad_norm": 3.478276252746582, "learning_rate": 0.00017631814119749777, "loss": 0.3107, "step": 530 }, { "epoch": 0.48257372654155495, "grad_norm": 2.8934295177459717, "learning_rate": 0.00017587131367292227, "loss": 0.2283, "step": 540 }, { "epoch": 0.4915102770330652, "grad_norm": 2.323265552520752, "learning_rate": 0.00017542448614834674, "loss": 0.253, "step": 550 }, { "epoch": 0.5004468275245755, "grad_norm": 0.8551294207572937, "learning_rate": 0.00017497765862377123, "loss": 0.2903, "step": 560 }, { "epoch": 0.5093833780160858, "grad_norm": 3.454586982727051, "learning_rate": 0.00017453083109919573, "loss": 0.2434, "step": 570 }, { "epoch": 0.5183199285075961, "grad_norm": 4.937902927398682, "learning_rate": 0.0001740840035746202, "loss": 0.254, "step": 580 }, { "epoch": 0.5272564789991063, "grad_norm": 7.016292095184326, "learning_rate": 0.0001736371760500447, "loss": 0.3365, "step": 590 }, { "epoch": 0.5361930294906166, "grad_norm": 4.799339294433594, "learning_rate": 0.0001731903485254692, "loss": 0.2639, "step": 600 }, { "epoch": 0.5451295799821269, "grad_norm": 3.4884583950042725, "learning_rate": 0.00017274352100089365, "loss": 0.2255, "step": 610 }, { "epoch": 0.5540661304736372, "grad_norm": 1.6748002767562866, "learning_rate": 0.00017229669347631815, "loss": 0.2678, "step": 620 }, { "epoch": 0.5630026809651475, "grad_norm": 4.145753383636475, "learning_rate": 0.00017184986595174265, "loss": 0.3435, "step": 630 }, { "epoch": 0.5719392314566577, "grad_norm": 3.8941946029663086, "learning_rate": 0.00017140303842716711, "loss": 0.3448, "step": 640 }, { "epoch": 0.580875781948168, "grad_norm": 2.7980730533599854, "learning_rate": 0.0001709562109025916, "loss": 0.1776, "step": 650 }, { "epoch": 0.5898123324396782, "grad_norm": 2.6846330165863037, "learning_rate": 0.0001705093833780161, "loss": 0.1698, "step": 660 }, { "epoch": 0.5987488829311886, "grad_norm": 0.6026754379272461, "learning_rate": 0.00017006255585344057, "loss": 0.2683, "step": 670 }, { "epoch": 0.6076854334226989, "grad_norm": 1.7795771360397339, "learning_rate": 0.00016961572832886507, "loss": 0.1734, "step": 680 }, { "epoch": 0.6166219839142091, "grad_norm": 2.9793999195098877, "learning_rate": 0.00016916890080428956, "loss": 0.2341, "step": 690 }, { "epoch": 0.6255585344057194, "grad_norm": 2.885993480682373, "learning_rate": 0.00016872207327971403, "loss": 0.1787, "step": 700 }, { "epoch": 0.6344950848972297, "grad_norm": 0.12324349582195282, "learning_rate": 0.00016827524575513853, "loss": 0.1716, "step": 710 }, { "epoch": 0.6434316353887399, "grad_norm": 3.5133144855499268, "learning_rate": 0.00016782841823056302, "loss": 0.248, "step": 720 }, { "epoch": 0.6523681858802503, "grad_norm": 0.16779197752475739, "learning_rate": 0.0001673815907059875, "loss": 0.1969, "step": 730 }, { "epoch": 0.6613047363717605, "grad_norm": 6.696399688720703, "learning_rate": 0.00016693476318141199, "loss": 0.2659, "step": 740 }, { "epoch": 0.6702412868632708, "grad_norm": 3.4363462924957275, "learning_rate": 0.00016648793565683648, "loss": 0.1933, "step": 750 }, { "epoch": 0.6791778373547811, "grad_norm": 2.9766454696655273, "learning_rate": 0.00016604110813226095, "loss": 0.3646, "step": 760 }, { "epoch": 0.6881143878462913, "grad_norm": 3.3751492500305176, "learning_rate": 0.00016559428060768544, "loss": 0.2365, "step": 770 }, { "epoch": 0.6970509383378016, "grad_norm": 1.6829200983047485, "learning_rate": 0.00016514745308310994, "loss": 0.3954, "step": 780 }, { "epoch": 0.7059874888293118, "grad_norm": 3.473019599914551, "learning_rate": 0.0001647006255585344, "loss": 0.2257, "step": 790 }, { "epoch": 0.7149240393208222, "grad_norm": 2.1625287532806396, "learning_rate": 0.0001642537980339589, "loss": 0.2984, "step": 800 }, { "epoch": 0.7238605898123325, "grad_norm": 1.2086807489395142, "learning_rate": 0.00016380697050938337, "loss": 0.2888, "step": 810 }, { "epoch": 0.7327971403038427, "grad_norm": 0.19183319807052612, "learning_rate": 0.00016336014298480787, "loss": 0.2884, "step": 820 }, { "epoch": 0.741733690795353, "grad_norm": 4.687781810760498, "learning_rate": 0.00016291331546023236, "loss": 0.2456, "step": 830 }, { "epoch": 0.7506702412868632, "grad_norm": 1.6150999069213867, "learning_rate": 0.00016246648793565683, "loss": 0.2231, "step": 840 }, { "epoch": 0.7596067917783735, "grad_norm": 2.592801809310913, "learning_rate": 0.00016201966041108133, "loss": 0.2425, "step": 850 }, { "epoch": 0.7685433422698839, "grad_norm": 5.782228469848633, "learning_rate": 0.00016157283288650582, "loss": 0.2184, "step": 860 }, { "epoch": 0.7774798927613941, "grad_norm": 4.794034957885742, "learning_rate": 0.0001611260053619303, "loss": 0.1862, "step": 870 }, { "epoch": 0.7864164432529044, "grad_norm": 6.517756462097168, "learning_rate": 0.0001606791778373548, "loss": 0.3502, "step": 880 }, { "epoch": 0.7953529937444147, "grad_norm": 6.479066848754883, "learning_rate": 0.00016023235031277928, "loss": 0.1433, "step": 890 }, { "epoch": 0.8042895442359249, "grad_norm": 1.539117455482483, "learning_rate": 0.00015978552278820375, "loss": 0.3306, "step": 900 }, { "epoch": 0.8132260947274352, "grad_norm": 2.0679945945739746, "learning_rate": 0.00015933869526362827, "loss": 0.2325, "step": 910 }, { "epoch": 0.8221626452189454, "grad_norm": 4.1405558586120605, "learning_rate": 0.00015889186773905274, "loss": 0.2126, "step": 920 }, { "epoch": 0.8310991957104558, "grad_norm": 3.7805371284484863, "learning_rate": 0.0001584450402144772, "loss": 0.2418, "step": 930 }, { "epoch": 0.8400357462019661, "grad_norm": 4.6036248207092285, "learning_rate": 0.0001579982126899017, "loss": 0.3191, "step": 940 }, { "epoch": 0.8489722966934763, "grad_norm": 0.8650698661804199, "learning_rate": 0.0001575513851653262, "loss": 0.0849, "step": 950 }, { "epoch": 0.8579088471849866, "grad_norm": 0.4226575791835785, "learning_rate": 0.00015710455764075067, "loss": 0.1689, "step": 960 }, { "epoch": 0.8668453976764968, "grad_norm": 4.508443355560303, "learning_rate": 0.00015665773011617516, "loss": 0.1381, "step": 970 }, { "epoch": 0.8757819481680071, "grad_norm": 5.323261260986328, "learning_rate": 0.00015621090259159966, "loss": 0.1799, "step": 980 }, { "epoch": 0.8847184986595175, "grad_norm": 4.80311393737793, "learning_rate": 0.00015576407506702412, "loss": 0.1616, "step": 990 }, { "epoch": 0.8936550491510277, "grad_norm": 2.0073227882385254, "learning_rate": 0.00015531724754244862, "loss": 0.1814, "step": 1000 }, { "epoch": 0.8936550491510277, "eval_accuracy": 0.9487437185929648, "eval_loss": 0.17145079374313354, "eval_runtime": 56.2937, "eval_samples_per_second": 35.35, "eval_steps_per_second": 4.423, "step": 1000 }, { "epoch": 0.902591599642538, "grad_norm": 3.7729084491729736, "learning_rate": 0.00015487042001787312, "loss": 0.1352, "step": 1010 }, { "epoch": 0.9115281501340483, "grad_norm": 6.341058254241943, "learning_rate": 0.00015442359249329758, "loss": 0.2095, "step": 1020 }, { "epoch": 0.9204647006255585, "grad_norm": 0.42624279856681824, "learning_rate": 0.00015397676496872208, "loss": 0.2741, "step": 1030 }, { "epoch": 0.9294012511170688, "grad_norm": 4.386059761047363, "learning_rate": 0.00015352993744414657, "loss": 0.1688, "step": 1040 }, { "epoch": 0.938337801608579, "grad_norm": 0.07118342816829681, "learning_rate": 0.00015308310991957104, "loss": 0.2251, "step": 1050 }, { "epoch": 0.9472743521000894, "grad_norm": 1.6763445138931274, "learning_rate": 0.00015263628239499554, "loss": 0.2107, "step": 1060 }, { "epoch": 0.9562109025915997, "grad_norm": 5.161765098571777, "learning_rate": 0.00015218945487042003, "loss": 0.1552, "step": 1070 }, { "epoch": 0.9651474530831099, "grad_norm": 1.8887020349502563, "learning_rate": 0.0001517426273458445, "loss": 0.259, "step": 1080 }, { "epoch": 0.9740840035746202, "grad_norm": 0.29733049869537354, "learning_rate": 0.000151295799821269, "loss": 0.1136, "step": 1090 }, { "epoch": 0.9830205540661304, "grad_norm": 3.253506660461426, "learning_rate": 0.0001508489722966935, "loss": 0.1211, "step": 1100 }, { "epoch": 0.9919571045576407, "grad_norm": 2.1613495349884033, "learning_rate": 0.00015040214477211796, "loss": 0.1925, "step": 1110 }, { "epoch": 1.000893655049151, "grad_norm": 0.23403897881507874, "learning_rate": 0.00014995531724754246, "loss": 0.1099, "step": 1120 }, { "epoch": 1.0098302055406614, "grad_norm": 0.7746050357818604, "learning_rate": 0.00014950848972296695, "loss": 0.0627, "step": 1130 }, { "epoch": 1.0187667560321716, "grad_norm": 0.3406558930873871, "learning_rate": 0.00014906166219839145, "loss": 0.063, "step": 1140 }, { "epoch": 1.0277033065236818, "grad_norm": 0.12071269750595093, "learning_rate": 0.00014861483467381591, "loss": 0.1432, "step": 1150 }, { "epoch": 1.0366398570151922, "grad_norm": 0.4978802800178528, "learning_rate": 0.00014816800714924038, "loss": 0.0746, "step": 1160 }, { "epoch": 1.0455764075067024, "grad_norm": 1.3803187608718872, "learning_rate": 0.0001477211796246649, "loss": 0.0929, "step": 1170 }, { "epoch": 1.0545129579982127, "grad_norm": 0.8612852692604065, "learning_rate": 0.00014727435210008937, "loss": 0.0949, "step": 1180 }, { "epoch": 1.063449508489723, "grad_norm": 0.3493196666240692, "learning_rate": 0.00014682752457551384, "loss": 0.168, "step": 1190 }, { "epoch": 1.0723860589812333, "grad_norm": 0.18564535677433014, "learning_rate": 0.00014638069705093836, "loss": 0.1344, "step": 1200 }, { "epoch": 1.0813226094727435, "grad_norm": 5.853936672210693, "learning_rate": 0.00014593386952636283, "loss": 0.0874, "step": 1210 }, { "epoch": 1.0902591599642537, "grad_norm": 5.366926670074463, "learning_rate": 0.0001454870420017873, "loss": 0.0847, "step": 1220 }, { "epoch": 1.0991957104557641, "grad_norm": 0.04646310582756996, "learning_rate": 0.00014504021447721182, "loss": 0.0947, "step": 1230 }, { "epoch": 1.1081322609472744, "grad_norm": 0.7593271732330322, "learning_rate": 0.0001445933869526363, "loss": 0.1208, "step": 1240 }, { "epoch": 1.1170688114387846, "grad_norm": 6.3901472091674805, "learning_rate": 0.00014414655942806076, "loss": 0.0977, "step": 1250 }, { "epoch": 1.126005361930295, "grad_norm": 1.7897100448608398, "learning_rate": 0.00014369973190348528, "loss": 0.1107, "step": 1260 }, { "epoch": 1.1349419124218052, "grad_norm": 3.0502521991729736, "learning_rate": 0.00014325290437890975, "loss": 0.1854, "step": 1270 }, { "epoch": 1.1438784629133154, "grad_norm": 0.18816685676574707, "learning_rate": 0.00014280607685433422, "loss": 0.0863, "step": 1280 }, { "epoch": 1.1528150134048256, "grad_norm": 0.05062058940529823, "learning_rate": 0.0001423592493297587, "loss": 0.1339, "step": 1290 }, { "epoch": 1.161751563896336, "grad_norm": 0.23230992257595062, "learning_rate": 0.0001419124218051832, "loss": 0.187, "step": 1300 }, { "epoch": 1.1706881143878463, "grad_norm": 3.0492892265319824, "learning_rate": 0.00014146559428060768, "loss": 0.0701, "step": 1310 }, { "epoch": 1.1796246648793565, "grad_norm": 0.03424559161067009, "learning_rate": 0.00014101876675603217, "loss": 0.0428, "step": 1320 }, { "epoch": 1.188561215370867, "grad_norm": 3.6026527881622314, "learning_rate": 0.00014057193923145667, "loss": 0.1337, "step": 1330 }, { "epoch": 1.197497765862377, "grad_norm": 0.09644579142332077, "learning_rate": 0.00014012511170688114, "loss": 0.1271, "step": 1340 }, { "epoch": 1.2064343163538873, "grad_norm": 0.22322706878185272, "learning_rate": 0.00013967828418230563, "loss": 0.055, "step": 1350 }, { "epoch": 1.2153708668453977, "grad_norm": 0.05372155085206032, "learning_rate": 0.00013923145665773013, "loss": 0.1251, "step": 1360 }, { "epoch": 1.224307417336908, "grad_norm": 0.4156355857849121, "learning_rate": 0.00013878462913315462, "loss": 0.0405, "step": 1370 }, { "epoch": 1.2332439678284182, "grad_norm": 0.030704261735081673, "learning_rate": 0.0001383378016085791, "loss": 0.1225, "step": 1380 }, { "epoch": 1.2421805183199286, "grad_norm": 0.09552694112062454, "learning_rate": 0.00013789097408400359, "loss": 0.0495, "step": 1390 }, { "epoch": 1.2511170688114388, "grad_norm": 2.1124463081359863, "learning_rate": 0.00013744414655942808, "loss": 0.0651, "step": 1400 }, { "epoch": 1.260053619302949, "grad_norm": 4.6221232414245605, "learning_rate": 0.00013699731903485255, "loss": 0.2366, "step": 1410 }, { "epoch": 1.2689901697944594, "grad_norm": 0.054540861397981644, "learning_rate": 0.00013655049151027704, "loss": 0.1915, "step": 1420 }, { "epoch": 1.2779267202859697, "grad_norm": 0.6603236198425293, "learning_rate": 0.00013610366398570154, "loss": 0.0386, "step": 1430 }, { "epoch": 1.2868632707774799, "grad_norm": 4.419101715087891, "learning_rate": 0.000135656836461126, "loss": 0.1288, "step": 1440 }, { "epoch": 1.2957998212689903, "grad_norm": 1.6491079330444336, "learning_rate": 0.0001352100089365505, "loss": 0.077, "step": 1450 }, { "epoch": 1.3047363717605005, "grad_norm": 0.904062807559967, "learning_rate": 0.000134763181411975, "loss": 0.2083, "step": 1460 }, { "epoch": 1.3136729222520107, "grad_norm": 3.4404361248016357, "learning_rate": 0.00013431635388739947, "loss": 0.1846, "step": 1470 }, { "epoch": 1.322609472743521, "grad_norm": 0.2096666842699051, "learning_rate": 0.00013386952636282396, "loss": 0.0354, "step": 1480 }, { "epoch": 1.3315460232350314, "grad_norm": 4.2826128005981445, "learning_rate": 0.00013342269883824846, "loss": 0.1438, "step": 1490 }, { "epoch": 1.3404825737265416, "grad_norm": 4.742111682891846, "learning_rate": 0.00013297587131367293, "loss": 0.0994, "step": 1500 }, { "epoch": 1.3494191242180518, "grad_norm": 6.2931952476501465, "learning_rate": 0.0001325290437890974, "loss": 0.0754, "step": 1510 }, { "epoch": 1.358355674709562, "grad_norm": 1.523571491241455, "learning_rate": 0.00013208221626452192, "loss": 0.1283, "step": 1520 }, { "epoch": 1.3672922252010724, "grad_norm": 8.253166198730469, "learning_rate": 0.00013163538873994638, "loss": 0.1718, "step": 1530 }, { "epoch": 1.3762287756925826, "grad_norm": 2.4168646335601807, "learning_rate": 0.00013118856121537085, "loss": 0.1285, "step": 1540 }, { "epoch": 1.3851653261840928, "grad_norm": 4.069122314453125, "learning_rate": 0.00013074173369079537, "loss": 0.1165, "step": 1550 }, { "epoch": 1.3941018766756033, "grad_norm": 0.2789513170719147, "learning_rate": 0.00013029490616621984, "loss": 0.0795, "step": 1560 }, { "epoch": 1.4030384271671135, "grad_norm": 0.5609318017959595, "learning_rate": 0.0001298480786416443, "loss": 0.1187, "step": 1570 }, { "epoch": 1.4119749776586237, "grad_norm": 0.34373611211776733, "learning_rate": 0.00012940125111706883, "loss": 0.0872, "step": 1580 }, { "epoch": 1.420911528150134, "grad_norm": 4.596048355102539, "learning_rate": 0.0001289544235924933, "loss": 0.1354, "step": 1590 }, { "epoch": 1.4298480786416443, "grad_norm": 0.06107456609606743, "learning_rate": 0.00012850759606791777, "loss": 0.119, "step": 1600 }, { "epoch": 1.4387846291331545, "grad_norm": 0.08292512595653534, "learning_rate": 0.0001280607685433423, "loss": 0.1075, "step": 1610 }, { "epoch": 1.447721179624665, "grad_norm": 0.04113980755209923, "learning_rate": 0.00012761394101876676, "loss": 0.099, "step": 1620 }, { "epoch": 1.4566577301161752, "grad_norm": 3.1171679496765137, "learning_rate": 0.00012716711349419126, "loss": 0.0476, "step": 1630 }, { "epoch": 1.4655942806076854, "grad_norm": 0.03248828276991844, "learning_rate": 0.00012672028596961572, "loss": 0.1217, "step": 1640 }, { "epoch": 1.4745308310991958, "grad_norm": 0.14615251123905182, "learning_rate": 0.00012627345844504022, "loss": 0.0845, "step": 1650 }, { "epoch": 1.483467381590706, "grad_norm": 0.8569982647895813, "learning_rate": 0.00012582663092046471, "loss": 0.0933, "step": 1660 }, { "epoch": 1.4924039320822162, "grad_norm": 0.030800212174654007, "learning_rate": 0.00012537980339588918, "loss": 0.0555, "step": 1670 }, { "epoch": 1.5013404825737267, "grad_norm": 0.9634251594543457, "learning_rate": 0.00012493297587131368, "loss": 0.1249, "step": 1680 }, { "epoch": 1.5102770330652369, "grad_norm": 0.06999039649963379, "learning_rate": 0.00012448614834673817, "loss": 0.0727, "step": 1690 }, { "epoch": 1.519213583556747, "grad_norm": 0.0438673160970211, "learning_rate": 0.00012403932082216264, "loss": 0.0595, "step": 1700 }, { "epoch": 1.5281501340482575, "grad_norm": 0.030631419271230698, "learning_rate": 0.00012359249329758714, "loss": 0.0641, "step": 1710 }, { "epoch": 1.5370866845397675, "grad_norm": 0.09066120535135269, "learning_rate": 0.00012314566577301163, "loss": 0.0689, "step": 1720 }, { "epoch": 1.546023235031278, "grad_norm": 1.1478157043457031, "learning_rate": 0.0001226988382484361, "loss": 0.0427, "step": 1730 }, { "epoch": 1.5549597855227884, "grad_norm": 0.5382466912269592, "learning_rate": 0.0001222520107238606, "loss": 0.1211, "step": 1740 }, { "epoch": 1.5638963360142983, "grad_norm": 0.15291939675807953, "learning_rate": 0.00012180518319928509, "loss": 0.1934, "step": 1750 }, { "epoch": 1.5728328865058088, "grad_norm": 0.07158921658992767, "learning_rate": 0.00012135835567470957, "loss": 0.045, "step": 1760 }, { "epoch": 1.5817694369973192, "grad_norm": 1.416129469871521, "learning_rate": 0.00012091152815013404, "loss": 0.0822, "step": 1770 }, { "epoch": 1.5907059874888292, "grad_norm": 3.2841928005218506, "learning_rate": 0.00012046470062555855, "loss": 0.0685, "step": 1780 }, { "epoch": 1.5996425379803396, "grad_norm": 5.683614730834961, "learning_rate": 0.00012001787310098302, "loss": 0.1512, "step": 1790 }, { "epoch": 1.6085790884718498, "grad_norm": 0.054330743849277496, "learning_rate": 0.0001195710455764075, "loss": 0.1381, "step": 1800 }, { "epoch": 1.61751563896336, "grad_norm": 0.05368073284626007, "learning_rate": 0.00011912421805183201, "loss": 0.1118, "step": 1810 }, { "epoch": 1.6264521894548705, "grad_norm": 17.735898971557617, "learning_rate": 0.00011867739052725648, "loss": 0.1704, "step": 1820 }, { "epoch": 1.6353887399463807, "grad_norm": 3.4387574195861816, "learning_rate": 0.00011823056300268096, "loss": 0.1498, "step": 1830 }, { "epoch": 1.6443252904378909, "grad_norm": 3.4959723949432373, "learning_rate": 0.00011778373547810547, "loss": 0.0667, "step": 1840 }, { "epoch": 1.6532618409294013, "grad_norm": 1.4753037691116333, "learning_rate": 0.00011733690795352994, "loss": 0.0445, "step": 1850 }, { "epoch": 1.6621983914209115, "grad_norm": 0.24579989910125732, "learning_rate": 0.00011689008042895442, "loss": 0.0377, "step": 1860 }, { "epoch": 1.6711349419124217, "grad_norm": 3.813619375228882, "learning_rate": 0.00011644325290437891, "loss": 0.1004, "step": 1870 }, { "epoch": 1.6800714924039322, "grad_norm": 0.808028519153595, "learning_rate": 0.0001159964253798034, "loss": 0.0679, "step": 1880 }, { "epoch": 1.6890080428954424, "grad_norm": 0.277228444814682, "learning_rate": 0.0001155495978552279, "loss": 0.1096, "step": 1890 }, { "epoch": 1.6979445933869526, "grad_norm": 2.485595703125, "learning_rate": 0.00011510277033065237, "loss": 0.0738, "step": 1900 }, { "epoch": 1.706881143878463, "grad_norm": 0.35362759232521057, "learning_rate": 0.00011465594280607685, "loss": 0.0807, "step": 1910 }, { "epoch": 1.7158176943699732, "grad_norm": 1.7707135677337646, "learning_rate": 0.00011420911528150135, "loss": 0.0603, "step": 1920 }, { "epoch": 1.7247542448614834, "grad_norm": 0.010053984820842743, "learning_rate": 0.00011376228775692583, "loss": 0.0142, "step": 1930 }, { "epoch": 1.7336907953529939, "grad_norm": 11.442891120910645, "learning_rate": 0.00011331546023235031, "loss": 0.0509, "step": 1940 }, { "epoch": 1.742627345844504, "grad_norm": 2.5633316040039062, "learning_rate": 0.00011286863270777481, "loss": 0.0204, "step": 1950 }, { "epoch": 1.7515638963360143, "grad_norm": 0.9002701044082642, "learning_rate": 0.00011242180518319929, "loss": 0.0822, "step": 1960 }, { "epoch": 1.7605004468275247, "grad_norm": 0.03169967234134674, "learning_rate": 0.00011197497765862377, "loss": 0.0951, "step": 1970 }, { "epoch": 1.7694369973190347, "grad_norm": 0.07693292945623398, "learning_rate": 0.00011152815013404827, "loss": 0.11, "step": 1980 }, { "epoch": 1.7783735478105451, "grad_norm": 0.06315601617097855, "learning_rate": 0.00011108132260947275, "loss": 0.1217, "step": 1990 }, { "epoch": 1.7873100983020556, "grad_norm": 0.26389381289482117, "learning_rate": 0.00011063449508489723, "loss": 0.1077, "step": 2000 }, { "epoch": 1.7873100983020556, "eval_accuracy": 0.9668341708542714, "eval_loss": 0.12829196453094482, "eval_runtime": 56.3081, "eval_samples_per_second": 35.341, "eval_steps_per_second": 4.422, "step": 2000 }, { "epoch": 1.7962466487935655, "grad_norm": 0.14058926701545715, "learning_rate": 0.00011018766756032173, "loss": 0.051, "step": 2010 }, { "epoch": 1.805183199285076, "grad_norm": 0.8464193940162659, "learning_rate": 0.00010974084003574621, "loss": 0.0557, "step": 2020 }, { "epoch": 1.8141197497765862, "grad_norm": 0.5524567365646362, "learning_rate": 0.00010929401251117069, "loss": 0.0327, "step": 2030 }, { "epoch": 1.8230563002680964, "grad_norm": 4.706042289733887, "learning_rate": 0.00010884718498659518, "loss": 0.0815, "step": 2040 }, { "epoch": 1.8319928507596068, "grad_norm": 5.365744113922119, "learning_rate": 0.00010840035746201967, "loss": 0.0617, "step": 2050 }, { "epoch": 1.840929401251117, "grad_norm": 1.1039865016937256, "learning_rate": 0.00010795352993744415, "loss": 0.0528, "step": 2060 }, { "epoch": 1.8498659517426272, "grad_norm": 2.8230929374694824, "learning_rate": 0.00010750670241286864, "loss": 0.0534, "step": 2070 }, { "epoch": 1.8588025022341377, "grad_norm": 0.02104310691356659, "learning_rate": 0.00010705987488829313, "loss": 0.1058, "step": 2080 }, { "epoch": 1.8677390527256479, "grad_norm": 0.030116664245724678, "learning_rate": 0.0001066130473637176, "loss": 0.0971, "step": 2090 }, { "epoch": 1.876675603217158, "grad_norm": 0.5036576986312866, "learning_rate": 0.0001061662198391421, "loss": 0.0693, "step": 2100 }, { "epoch": 1.8856121537086685, "grad_norm": 4.131002426147461, "learning_rate": 0.00010571939231456658, "loss": 0.0933, "step": 2110 }, { "epoch": 1.8945487042001787, "grad_norm": 5.004481792449951, "learning_rate": 0.00010527256478999108, "loss": 0.0698, "step": 2120 }, { "epoch": 1.903485254691689, "grad_norm": 0.014153541065752506, "learning_rate": 0.00010482573726541556, "loss": 0.0598, "step": 2130 }, { "epoch": 1.9124218051831994, "grad_norm": 0.39952540397644043, "learning_rate": 0.00010437890974084004, "loss": 0.1169, "step": 2140 }, { "epoch": 1.9213583556747096, "grad_norm": 5.047325611114502, "learning_rate": 0.00010393208221626454, "loss": 0.1492, "step": 2150 }, { "epoch": 1.9302949061662198, "grad_norm": 0.045367881655693054, "learning_rate": 0.00010348525469168902, "loss": 0.081, "step": 2160 }, { "epoch": 1.9392314566577302, "grad_norm": 0.02820589952170849, "learning_rate": 0.00010303842716711349, "loss": 0.1456, "step": 2170 }, { "epoch": 1.9481680071492404, "grad_norm": 0.15606756508350372, "learning_rate": 0.000102591599642538, "loss": 0.0484, "step": 2180 }, { "epoch": 1.9571045576407506, "grad_norm": 4.374292850494385, "learning_rate": 0.00010214477211796248, "loss": 0.1133, "step": 2190 }, { "epoch": 1.966041108132261, "grad_norm": 0.6300436854362488, "learning_rate": 0.00010169794459338695, "loss": 0.0159, "step": 2200 }, { "epoch": 1.974977658623771, "grad_norm": 0.011597417294979095, "learning_rate": 0.00010125111706881146, "loss": 0.019, "step": 2210 }, { "epoch": 1.9839142091152815, "grad_norm": 0.013629280962049961, "learning_rate": 0.00010080428954423592, "loss": 0.0953, "step": 2220 }, { "epoch": 1.992850759606792, "grad_norm": 4.461750030517578, "learning_rate": 0.0001003574620196604, "loss": 0.1169, "step": 2230 }, { "epoch": 2.001787310098302, "grad_norm": 0.2028690129518509, "learning_rate": 9.99106344950849e-05, "loss": 0.0515, "step": 2240 }, { "epoch": 2.0107238605898123, "grad_norm": 0.683179497718811, "learning_rate": 9.946380697050938e-05, "loss": 0.0414, "step": 2250 }, { "epoch": 2.0196604110813228, "grad_norm": 0.3097274601459503, "learning_rate": 9.901697944593388e-05, "loss": 0.013, "step": 2260 }, { "epoch": 2.0285969615728328, "grad_norm": 0.02391964942216873, "learning_rate": 9.857015192135836e-05, "loss": 0.0143, "step": 2270 }, { "epoch": 2.037533512064343, "grad_norm": 0.02549424022436142, "learning_rate": 9.812332439678284e-05, "loss": 0.0321, "step": 2280 }, { "epoch": 2.0464700625558536, "grad_norm": 0.015907390043139458, "learning_rate": 9.767649687220734e-05, "loss": 0.0905, "step": 2290 }, { "epoch": 2.0554066130473636, "grad_norm": 0.04600854963064194, "learning_rate": 9.722966934763182e-05, "loss": 0.0055, "step": 2300 }, { "epoch": 2.064343163538874, "grad_norm": 0.17837274074554443, "learning_rate": 9.67828418230563e-05, "loss": 0.0792, "step": 2310 }, { "epoch": 2.0732797140303845, "grad_norm": 0.678176760673523, "learning_rate": 9.63360142984808e-05, "loss": 0.1025, "step": 2320 }, { "epoch": 2.0822162645218945, "grad_norm": 0.047438375651836395, "learning_rate": 9.588918677390528e-05, "loss": 0.0037, "step": 2330 }, { "epoch": 2.091152815013405, "grad_norm": 0.3825267553329468, "learning_rate": 9.544235924932976e-05, "loss": 0.0271, "step": 2340 }, { "epoch": 2.1000893655049153, "grad_norm": 0.022976990789175034, "learning_rate": 9.499553172475425e-05, "loss": 0.0055, "step": 2350 }, { "epoch": 2.1090259159964253, "grad_norm": 0.21945427358150482, "learning_rate": 9.454870420017874e-05, "loss": 0.0072, "step": 2360 }, { "epoch": 2.1179624664879357, "grad_norm": 0.020401885733008385, "learning_rate": 9.410187667560322e-05, "loss": 0.0045, "step": 2370 }, { "epoch": 2.126899016979446, "grad_norm": 0.3614647388458252, "learning_rate": 9.365504915102771e-05, "loss": 0.0292, "step": 2380 }, { "epoch": 2.135835567470956, "grad_norm": 0.01699133589863777, "learning_rate": 9.32082216264522e-05, "loss": 0.0728, "step": 2390 }, { "epoch": 2.1447721179624666, "grad_norm": 0.012751326896250248, "learning_rate": 9.276139410187668e-05, "loss": 0.0358, "step": 2400 }, { "epoch": 2.1537086684539766, "grad_norm": 0.009738125838339329, "learning_rate": 9.231456657730116e-05, "loss": 0.0415, "step": 2410 }, { "epoch": 2.162645218945487, "grad_norm": 0.012577983550727367, "learning_rate": 9.186773905272565e-05, "loss": 0.0204, "step": 2420 }, { "epoch": 2.1715817694369974, "grad_norm": 0.022706875577569008, "learning_rate": 9.142091152815015e-05, "loss": 0.0391, "step": 2430 }, { "epoch": 2.1805183199285074, "grad_norm": 1.2650375366210938, "learning_rate": 9.097408400357462e-05, "loss": 0.005, "step": 2440 }, { "epoch": 2.189454870420018, "grad_norm": 0.012098530307412148, "learning_rate": 9.052725647899911e-05, "loss": 0.0631, "step": 2450 }, { "epoch": 2.1983914209115283, "grad_norm": 0.014217260293662548, "learning_rate": 9.00804289544236e-05, "loss": 0.0158, "step": 2460 }, { "epoch": 2.2073279714030383, "grad_norm": 9.968586921691895, "learning_rate": 8.963360142984808e-05, "loss": 0.0338, "step": 2470 }, { "epoch": 2.2162645218945487, "grad_norm": 0.008608737029135227, "learning_rate": 8.918677390527257e-05, "loss": 0.0344, "step": 2480 }, { "epoch": 2.225201072386059, "grad_norm": 0.0957435816526413, "learning_rate": 8.873994638069705e-05, "loss": 0.0346, "step": 2490 }, { "epoch": 2.234137622877569, "grad_norm": 0.009171651676297188, "learning_rate": 8.829311885612154e-05, "loss": 0.0534, "step": 2500 }, { "epoch": 2.2430741733690795, "grad_norm": 0.025571748614311218, "learning_rate": 8.784629133154603e-05, "loss": 0.0046, "step": 2510 }, { "epoch": 2.25201072386059, "grad_norm": 0.008803543634712696, "learning_rate": 8.739946380697051e-05, "loss": 0.0104, "step": 2520 }, { "epoch": 2.2609472743521, "grad_norm": 0.009746580384671688, "learning_rate": 8.6952636282395e-05, "loss": 0.0194, "step": 2530 }, { "epoch": 2.2698838248436104, "grad_norm": 4.104613780975342, "learning_rate": 8.650580875781949e-05, "loss": 0.0155, "step": 2540 }, { "epoch": 2.278820375335121, "grad_norm": 0.01826513558626175, "learning_rate": 8.605898123324397e-05, "loss": 0.0072, "step": 2550 }, { "epoch": 2.287756925826631, "grad_norm": 0.03380773961544037, "learning_rate": 8.561215370866847e-05, "loss": 0.0515, "step": 2560 }, { "epoch": 2.2966934763181412, "grad_norm": 0.13917675614356995, "learning_rate": 8.516532618409293e-05, "loss": 0.0553, "step": 2570 }, { "epoch": 2.3056300268096512, "grad_norm": 3.9170970916748047, "learning_rate": 8.471849865951743e-05, "loss": 0.0252, "step": 2580 }, { "epoch": 2.3145665773011617, "grad_norm": 0.02010478265583515, "learning_rate": 8.427167113494193e-05, "loss": 0.0212, "step": 2590 }, { "epoch": 2.323503127792672, "grad_norm": 0.008358313702046871, "learning_rate": 8.38248436103664e-05, "loss": 0.1032, "step": 2600 }, { "epoch": 2.3324396782841825, "grad_norm": 0.08038530498743057, "learning_rate": 8.337801608579089e-05, "loss": 0.0445, "step": 2610 }, { "epoch": 2.3413762287756925, "grad_norm": 0.03653928264975548, "learning_rate": 8.293118856121538e-05, "loss": 0.0396, "step": 2620 }, { "epoch": 2.350312779267203, "grad_norm": 0.027160342782735825, "learning_rate": 8.248436103663985e-05, "loss": 0.0305, "step": 2630 }, { "epoch": 2.359249329758713, "grad_norm": 0.015198041684925556, "learning_rate": 8.203753351206435e-05, "loss": 0.0377, "step": 2640 }, { "epoch": 2.3681858802502234, "grad_norm": 0.03799434006214142, "learning_rate": 8.159070598748883e-05, "loss": 0.0057, "step": 2650 }, { "epoch": 2.377122430741734, "grad_norm": 0.008046945556998253, "learning_rate": 8.114387846291331e-05, "loss": 0.0249, "step": 2660 }, { "epoch": 2.386058981233244, "grad_norm": 8.727446556091309, "learning_rate": 8.069705093833781e-05, "loss": 0.0466, "step": 2670 }, { "epoch": 2.394995531724754, "grad_norm": 0.01986142434179783, "learning_rate": 8.025022341376229e-05, "loss": 0.0357, "step": 2680 }, { "epoch": 2.4039320822162646, "grad_norm": 7.71134614944458, "learning_rate": 7.980339588918678e-05, "loss": 0.015, "step": 2690 }, { "epoch": 2.4128686327077746, "grad_norm": 0.04247535765171051, "learning_rate": 7.935656836461127e-05, "loss": 0.0165, "step": 2700 }, { "epoch": 2.421805183199285, "grad_norm": 0.008588094264268875, "learning_rate": 7.890974084003575e-05, "loss": 0.0039, "step": 2710 }, { "epoch": 2.4307417336907955, "grad_norm": 0.11789193749427795, "learning_rate": 7.846291331546024e-05, "loss": 0.0344, "step": 2720 }, { "epoch": 2.4396782841823055, "grad_norm": 0.02231294848024845, "learning_rate": 7.801608579088472e-05, "loss": 0.0248, "step": 2730 }, { "epoch": 2.448614834673816, "grad_norm": 0.017268147319555283, "learning_rate": 7.75692582663092e-05, "loss": 0.0716, "step": 2740 }, { "epoch": 2.4575513851653263, "grad_norm": 8.963982582092285, "learning_rate": 7.71224307417337e-05, "loss": 0.0282, "step": 2750 }, { "epoch": 2.4664879356568363, "grad_norm": 0.799085259437561, "learning_rate": 7.667560321715817e-05, "loss": 0.0416, "step": 2760 }, { "epoch": 2.4754244861483468, "grad_norm": 0.15468931198120117, "learning_rate": 7.622877569258267e-05, "loss": 0.0669, "step": 2770 }, { "epoch": 2.484361036639857, "grad_norm": 3.4924068450927734, "learning_rate": 7.578194816800716e-05, "loss": 0.0477, "step": 2780 }, { "epoch": 2.493297587131367, "grad_norm": 0.012834394350647926, "learning_rate": 7.533512064343163e-05, "loss": 0.0174, "step": 2790 }, { "epoch": 2.5022341376228776, "grad_norm": 0.039204515516757965, "learning_rate": 7.488829311885612e-05, "loss": 0.0699, "step": 2800 }, { "epoch": 2.5111706881143876, "grad_norm": 0.08284445852041245, "learning_rate": 7.444146559428062e-05, "loss": 0.0445, "step": 2810 }, { "epoch": 2.520107238605898, "grad_norm": 0.010827134363353252, "learning_rate": 7.39946380697051e-05, "loss": 0.043, "step": 2820 }, { "epoch": 2.5290437890974085, "grad_norm": 3.5454938411712646, "learning_rate": 7.354781054512958e-05, "loss": 0.0339, "step": 2830 }, { "epoch": 2.537980339588919, "grad_norm": 0.006842234171926975, "learning_rate": 7.310098302055406e-05, "loss": 0.0029, "step": 2840 }, { "epoch": 2.546916890080429, "grad_norm": 0.7790193557739258, "learning_rate": 7.265415549597856e-05, "loss": 0.0055, "step": 2850 }, { "epoch": 2.5558534405719393, "grad_norm": 0.022239111363887787, "learning_rate": 7.220732797140304e-05, "loss": 0.008, "step": 2860 }, { "epoch": 2.5647899910634493, "grad_norm": 0.05403418838977814, "learning_rate": 7.176050044682752e-05, "loss": 0.057, "step": 2870 }, { "epoch": 2.5737265415549597, "grad_norm": 0.008923870511353016, "learning_rate": 7.131367292225202e-05, "loss": 0.0045, "step": 2880 }, { "epoch": 2.58266309204647, "grad_norm": 0.02668040059506893, "learning_rate": 7.08668453976765e-05, "loss": 0.0551, "step": 2890 }, { "epoch": 2.5915996425379806, "grad_norm": 0.049835577607154846, "learning_rate": 7.042001787310098e-05, "loss": 0.0255, "step": 2900 }, { "epoch": 2.6005361930294906, "grad_norm": 0.19334334135055542, "learning_rate": 6.997319034852548e-05, "loss": 0.0434, "step": 2910 }, { "epoch": 2.609472743521001, "grad_norm": 2.9139554500579834, "learning_rate": 6.952636282394996e-05, "loss": 0.0069, "step": 2920 }, { "epoch": 2.618409294012511, "grad_norm": 0.006679228041321039, "learning_rate": 6.907953529937444e-05, "loss": 0.0021, "step": 2930 }, { "epoch": 2.6273458445040214, "grad_norm": 0.1680416613817215, "learning_rate": 6.863270777479894e-05, "loss": 0.0249, "step": 2940 }, { "epoch": 2.636282394995532, "grad_norm": 0.08290654420852661, "learning_rate": 6.818588025022342e-05, "loss": 0.029, "step": 2950 }, { "epoch": 2.645218945487042, "grad_norm": 0.013707391917705536, "learning_rate": 6.77390527256479e-05, "loss": 0.0124, "step": 2960 }, { "epoch": 2.6541554959785523, "grad_norm": 0.2275378704071045, "learning_rate": 6.72922252010724e-05, "loss": 0.035, "step": 2970 }, { "epoch": 2.6630920464700627, "grad_norm": 0.5669155716896057, "learning_rate": 6.684539767649688e-05, "loss": 0.0288, "step": 2980 }, { "epoch": 2.6720285969615727, "grad_norm": 0.01488091703504324, "learning_rate": 6.639857015192136e-05, "loss": 0.0438, "step": 2990 }, { "epoch": 2.680965147453083, "grad_norm": 3.9659953117370605, "learning_rate": 6.595174262734584e-05, "loss": 0.0652, "step": 3000 }, { "epoch": 2.680965147453083, "eval_accuracy": 0.9793969849246231, "eval_loss": 0.08239442110061646, "eval_runtime": 56.1213, "eval_samples_per_second": 35.459, "eval_steps_per_second": 4.437, "step": 3000 }, { "epoch": 2.6899016979445936, "grad_norm": 8.31395149230957, "learning_rate": 6.550491510277034e-05, "loss": 0.0098, "step": 3010 }, { "epoch": 2.6988382484361035, "grad_norm": 0.008468572981655598, "learning_rate": 6.505808757819482e-05, "loss": 0.1056, "step": 3020 }, { "epoch": 2.707774798927614, "grad_norm": 0.9328808188438416, "learning_rate": 6.46112600536193e-05, "loss": 0.0769, "step": 3030 }, { "epoch": 2.716711349419124, "grad_norm": 0.6114912629127502, "learning_rate": 6.41644325290438e-05, "loss": 0.0434, "step": 3040 }, { "epoch": 2.7256478999106344, "grad_norm": 0.03709472343325615, "learning_rate": 6.371760500446829e-05, "loss": 0.0166, "step": 3050 }, { "epoch": 2.734584450402145, "grad_norm": 0.1086587980389595, "learning_rate": 6.327077747989276e-05, "loss": 0.0047, "step": 3060 }, { "epoch": 2.7435210008936552, "grad_norm": 0.12008140981197357, "learning_rate": 6.282394995531725e-05, "loss": 0.0069, "step": 3070 }, { "epoch": 2.7524575513851652, "grad_norm": 0.017355024814605713, "learning_rate": 6.237712243074174e-05, "loss": 0.0033, "step": 3080 }, { "epoch": 2.7613941018766757, "grad_norm": 0.15070508420467377, "learning_rate": 6.193029490616622e-05, "loss": 0.0476, "step": 3090 }, { "epoch": 2.7703306523681857, "grad_norm": 0.022527649998664856, "learning_rate": 6.148346738159071e-05, "loss": 0.0243, "step": 3100 }, { "epoch": 2.779267202859696, "grad_norm": 0.37779930233955383, "learning_rate": 6.10366398570152e-05, "loss": 0.0058, "step": 3110 }, { "epoch": 2.7882037533512065, "grad_norm": 0.029893942177295685, "learning_rate": 6.0589812332439676e-05, "loss": 0.0208, "step": 3120 }, { "epoch": 2.797140303842717, "grad_norm": 0.01635076478123665, "learning_rate": 6.0142984807864165e-05, "loss": 0.0026, "step": 3130 }, { "epoch": 2.806076854334227, "grad_norm": 0.011868173256516457, "learning_rate": 5.969615728328865e-05, "loss": 0.0257, "step": 3140 }, { "epoch": 2.8150134048257374, "grad_norm": 0.02559722028672695, "learning_rate": 5.9249329758713135e-05, "loss": 0.0666, "step": 3150 }, { "epoch": 2.8239499553172474, "grad_norm": 0.01763424649834633, "learning_rate": 5.8802502234137623e-05, "loss": 0.0611, "step": 3160 }, { "epoch": 2.832886505808758, "grad_norm": 0.02686423808336258, "learning_rate": 5.835567470956211e-05, "loss": 0.0039, "step": 3170 }, { "epoch": 2.841823056300268, "grad_norm": 0.04632404074072838, "learning_rate": 5.79088471849866e-05, "loss": 0.0122, "step": 3180 }, { "epoch": 2.8507596067917786, "grad_norm": 0.1586790531873703, "learning_rate": 5.746201966041108e-05, "loss": 0.0026, "step": 3190 }, { "epoch": 2.8596961572832886, "grad_norm": 5.425605297088623, "learning_rate": 5.701519213583557e-05, "loss": 0.0622, "step": 3200 }, { "epoch": 2.868632707774799, "grad_norm": 0.006181008648127317, "learning_rate": 5.656836461126006e-05, "loss": 0.0028, "step": 3210 }, { "epoch": 2.877569258266309, "grad_norm": 0.09517185389995575, "learning_rate": 5.612153708668454e-05, "loss": 0.0035, "step": 3220 }, { "epoch": 2.8865058087578195, "grad_norm": 0.015022194012999535, "learning_rate": 5.567470956210903e-05, "loss": 0.0285, "step": 3230 }, { "epoch": 2.89544235924933, "grad_norm": 4.772485256195068, "learning_rate": 5.522788203753352e-05, "loss": 0.0279, "step": 3240 }, { "epoch": 2.90437890974084, "grad_norm": 3.1032145023345947, "learning_rate": 5.478105451295799e-05, "loss": 0.0049, "step": 3250 }, { "epoch": 2.9133154602323503, "grad_norm": 0.05868244543671608, "learning_rate": 5.433422698838249e-05, "loss": 0.0029, "step": 3260 }, { "epoch": 2.9222520107238603, "grad_norm": 0.008307090029120445, "learning_rate": 5.388739946380698e-05, "loss": 0.0099, "step": 3270 }, { "epoch": 2.9311885612153707, "grad_norm": 0.010392882861196995, "learning_rate": 5.344057193923145e-05, "loss": 0.002, "step": 3280 }, { "epoch": 2.940125111706881, "grad_norm": 0.005523020401597023, "learning_rate": 5.299374441465594e-05, "loss": 0.0035, "step": 3290 }, { "epoch": 2.9490616621983916, "grad_norm": 0.06098335236310959, "learning_rate": 5.2546916890080436e-05, "loss": 0.0056, "step": 3300 }, { "epoch": 2.9579982126899016, "grad_norm": 0.013083376921713352, "learning_rate": 5.2100089365504925e-05, "loss": 0.023, "step": 3310 }, { "epoch": 2.966934763181412, "grad_norm": 0.01605415530502796, "learning_rate": 5.16532618409294e-05, "loss": 0.0396, "step": 3320 }, { "epoch": 2.975871313672922, "grad_norm": 0.013243346475064754, "learning_rate": 5.120643431635389e-05, "loss": 0.0083, "step": 3330 }, { "epoch": 2.9848078641644324, "grad_norm": 1.4108890295028687, "learning_rate": 5.0759606791778383e-05, "loss": 0.0468, "step": 3340 }, { "epoch": 2.993744414655943, "grad_norm": 0.5704414248466492, "learning_rate": 5.031277926720286e-05, "loss": 0.0209, "step": 3350 }, { "epoch": 3.002680965147453, "grad_norm": 0.03908452019095421, "learning_rate": 4.986595174262735e-05, "loss": 0.0779, "step": 3360 }, { "epoch": 3.0116175156389633, "grad_norm": 0.010959290899336338, "learning_rate": 4.9419124218051835e-05, "loss": 0.0109, "step": 3370 }, { "epoch": 3.0205540661304737, "grad_norm": 0.028490547090768814, "learning_rate": 4.8972296693476324e-05, "loss": 0.0025, "step": 3380 }, { "epoch": 3.0294906166219837, "grad_norm": 0.00491972966119647, "learning_rate": 4.8525469168900806e-05, "loss": 0.0214, "step": 3390 }, { "epoch": 3.038427167113494, "grad_norm": 0.014270992018282413, "learning_rate": 4.8078641644325294e-05, "loss": 0.0048, "step": 3400 }, { "epoch": 3.0473637176050046, "grad_norm": 0.00458119623363018, "learning_rate": 4.7631814119749776e-05, "loss": 0.0308, "step": 3410 }, { "epoch": 3.0563002680965146, "grad_norm": 0.00890402402728796, "learning_rate": 4.7184986595174265e-05, "loss": 0.0019, "step": 3420 }, { "epoch": 3.065236818588025, "grad_norm": 0.004751246422529221, "learning_rate": 4.673815907059875e-05, "loss": 0.0028, "step": 3430 }, { "epoch": 3.0741733690795354, "grad_norm": 0.008143426850438118, "learning_rate": 4.6291331546023235e-05, "loss": 0.0015, "step": 3440 }, { "epoch": 3.0831099195710454, "grad_norm": 0.035306982696056366, "learning_rate": 4.5844504021447723e-05, "loss": 0.0561, "step": 3450 }, { "epoch": 3.092046470062556, "grad_norm": 0.006312028504908085, "learning_rate": 4.539767649687221e-05, "loss": 0.0407, "step": 3460 }, { "epoch": 3.1009830205540663, "grad_norm": 0.012918233871459961, "learning_rate": 4.4950848972296694e-05, "loss": 0.0204, "step": 3470 }, { "epoch": 3.1099195710455763, "grad_norm": 0.03429726883769035, "learning_rate": 4.450402144772118e-05, "loss": 0.0138, "step": 3480 }, { "epoch": 3.1188561215370867, "grad_norm": 0.032142043113708496, "learning_rate": 4.405719392314567e-05, "loss": 0.0074, "step": 3490 }, { "epoch": 3.127792672028597, "grad_norm": 0.11621160060167313, "learning_rate": 4.361036639857015e-05, "loss": 0.007, "step": 3500 }, { "epoch": 3.136729222520107, "grad_norm": 0.010225760750472546, "learning_rate": 4.316353887399464e-05, "loss": 0.0371, "step": 3510 }, { "epoch": 3.1456657730116175, "grad_norm": 0.0270242840051651, "learning_rate": 4.271671134941912e-05, "loss": 0.0024, "step": 3520 }, { "epoch": 3.154602323503128, "grad_norm": 0.561730146408081, "learning_rate": 4.226988382484361e-05, "loss": 0.0322, "step": 3530 }, { "epoch": 3.163538873994638, "grad_norm": 3.7698066234588623, "learning_rate": 4.18230563002681e-05, "loss": 0.0061, "step": 3540 }, { "epoch": 3.1724754244861484, "grad_norm": 0.08852257579565048, "learning_rate": 4.137622877569258e-05, "loss": 0.002, "step": 3550 }, { "epoch": 3.181411974977659, "grad_norm": 0.010241570882499218, "learning_rate": 4.092940125111707e-05, "loss": 0.0032, "step": 3560 }, { "epoch": 3.190348525469169, "grad_norm": 0.02900160290300846, "learning_rate": 4.048257372654156e-05, "loss": 0.0021, "step": 3570 }, { "epoch": 3.1992850759606792, "grad_norm": 0.012413430958986282, "learning_rate": 4.003574620196605e-05, "loss": 0.0016, "step": 3580 }, { "epoch": 3.2082216264521897, "grad_norm": 0.011820780113339424, "learning_rate": 3.958891867739053e-05, "loss": 0.0156, "step": 3590 }, { "epoch": 3.2171581769436997, "grad_norm": 0.0063424003310501575, "learning_rate": 3.914209115281501e-05, "loss": 0.0066, "step": 3600 }, { "epoch": 3.22609472743521, "grad_norm": 0.014534726738929749, "learning_rate": 3.8695263628239506e-05, "loss": 0.0023, "step": 3610 }, { "epoch": 3.23503127792672, "grad_norm": 0.0037305313162505627, "learning_rate": 3.824843610366399e-05, "loss": 0.0014, "step": 3620 }, { "epoch": 3.2439678284182305, "grad_norm": 0.004174220375716686, "learning_rate": 3.780160857908847e-05, "loss": 0.0022, "step": 3630 }, { "epoch": 3.252904378909741, "grad_norm": 0.02620732970535755, "learning_rate": 3.735478105451296e-05, "loss": 0.0033, "step": 3640 }, { "epoch": 3.2618409294012514, "grad_norm": 0.008887135423719883, "learning_rate": 3.690795352993745e-05, "loss": 0.0137, "step": 3650 }, { "epoch": 3.2707774798927614, "grad_norm": 0.0036694956943392754, "learning_rate": 3.6461126005361935e-05, "loss": 0.0016, "step": 3660 }, { "epoch": 3.279714030384272, "grad_norm": 0.005121259950101376, "learning_rate": 3.601429848078642e-05, "loss": 0.0023, "step": 3670 }, { "epoch": 3.2886505808757818, "grad_norm": 0.005332967732101679, "learning_rate": 3.55674709562109e-05, "loss": 0.0508, "step": 3680 }, { "epoch": 3.297587131367292, "grad_norm": 0.008636276237666607, "learning_rate": 3.5120643431635394e-05, "loss": 0.0015, "step": 3690 }, { "epoch": 3.3065236818588026, "grad_norm": 0.004048788454383612, "learning_rate": 3.4673815907059876e-05, "loss": 0.0015, "step": 3700 }, { "epoch": 3.3154602323503126, "grad_norm": 0.013148046098649502, "learning_rate": 3.4226988382484365e-05, "loss": 0.0021, "step": 3710 }, { "epoch": 3.324396782841823, "grad_norm": 0.003611048450693488, "learning_rate": 3.3780160857908846e-05, "loss": 0.0018, "step": 3720 }, { "epoch": 3.3333333333333335, "grad_norm": 0.0047615463845431805, "learning_rate": 3.3333333333333335e-05, "loss": 0.002, "step": 3730 }, { "epoch": 3.3422698838248435, "grad_norm": 0.052058279514312744, "learning_rate": 3.2886505808757823e-05, "loss": 0.0017, "step": 3740 }, { "epoch": 3.351206434316354, "grad_norm": 0.004867528565227985, "learning_rate": 3.2439678284182305e-05, "loss": 0.0015, "step": 3750 }, { "epoch": 3.3601429848078643, "grad_norm": 0.005437952931970358, "learning_rate": 3.1992850759606794e-05, "loss": 0.0027, "step": 3760 }, { "epoch": 3.3690795352993743, "grad_norm": 0.08657950907945633, "learning_rate": 3.154602323503128e-05, "loss": 0.0012, "step": 3770 }, { "epoch": 3.3780160857908847, "grad_norm": 0.003917807713150978, "learning_rate": 3.1099195710455764e-05, "loss": 0.0016, "step": 3780 }, { "epoch": 3.386952636282395, "grad_norm": 0.03561088442802429, "learning_rate": 3.065236818588025e-05, "loss": 0.0063, "step": 3790 }, { "epoch": 3.395889186773905, "grad_norm": 0.021922320127487183, "learning_rate": 3.0205540661304738e-05, "loss": 0.0195, "step": 3800 }, { "epoch": 3.4048257372654156, "grad_norm": 0.009989120066165924, "learning_rate": 2.9758713136729223e-05, "loss": 0.0027, "step": 3810 }, { "epoch": 3.413762287756926, "grad_norm": 0.004757929127663374, "learning_rate": 2.931188561215371e-05, "loss": 0.0213, "step": 3820 }, { "epoch": 3.422698838248436, "grad_norm": 0.005087022669613361, "learning_rate": 2.8865058087578197e-05, "loss": 0.012, "step": 3830 }, { "epoch": 3.4316353887399464, "grad_norm": 0.04014687240123749, "learning_rate": 2.8418230563002685e-05, "loss": 0.0016, "step": 3840 }, { "epoch": 3.4405719392314564, "grad_norm": 0.008556324057281017, "learning_rate": 2.7971403038427167e-05, "loss": 0.0466, "step": 3850 }, { "epoch": 3.449508489722967, "grad_norm": 0.0066629331558942795, "learning_rate": 2.7524575513851652e-05, "loss": 0.0013, "step": 3860 }, { "epoch": 3.4584450402144773, "grad_norm": 0.007047568913549185, "learning_rate": 2.707774798927614e-05, "loss": 0.0015, "step": 3870 }, { "epoch": 3.4673815907059877, "grad_norm": 0.0033304065000265837, "learning_rate": 2.6630920464700626e-05, "loss": 0.0186, "step": 3880 }, { "epoch": 3.4763181411974977, "grad_norm": 0.043915342539548874, "learning_rate": 2.6184092940125114e-05, "loss": 0.0013, "step": 3890 }, { "epoch": 3.485254691689008, "grad_norm": 0.005252317525446415, "learning_rate": 2.57372654155496e-05, "loss": 0.0036, "step": 3900 }, { "epoch": 3.494191242180518, "grad_norm": 0.005055012181401253, "learning_rate": 2.5290437890974085e-05, "loss": 0.0012, "step": 3910 }, { "epoch": 3.5031277926720286, "grad_norm": 0.0049805790185928345, "learning_rate": 2.484361036639857e-05, "loss": 0.0157, "step": 3920 }, { "epoch": 3.512064343163539, "grad_norm": 0.009514909237623215, "learning_rate": 2.439678284182306e-05, "loss": 0.0109, "step": 3930 }, { "epoch": 3.5210008936550494, "grad_norm": 0.03643026947975159, "learning_rate": 2.3949955317247544e-05, "loss": 0.0019, "step": 3940 }, { "epoch": 3.5299374441465594, "grad_norm": 0.056902140378952026, "learning_rate": 2.3503127792672032e-05, "loss": 0.0016, "step": 3950 }, { "epoch": 3.53887399463807, "grad_norm": 0.10358071327209473, "learning_rate": 2.3056300268096514e-05, "loss": 0.005, "step": 3960 }, { "epoch": 3.54781054512958, "grad_norm": 0.005386151373386383, "learning_rate": 2.2609472743521002e-05, "loss": 0.0021, "step": 3970 }, { "epoch": 3.5567470956210903, "grad_norm": 0.007350238971412182, "learning_rate": 2.2162645218945488e-05, "loss": 0.0016, "step": 3980 }, { "epoch": 3.5656836461126007, "grad_norm": 0.07326429337263107, "learning_rate": 2.1715817694369976e-05, "loss": 0.0084, "step": 3990 }, { "epoch": 3.5746201966041107, "grad_norm": 0.005603461060672998, "learning_rate": 2.126899016979446e-05, "loss": 0.0011, "step": 4000 }, { "epoch": 3.5746201966041107, "eval_accuracy": 0.9814070351758793, "eval_loss": 0.0710952952504158, "eval_runtime": 56.3405, "eval_samples_per_second": 35.321, "eval_steps_per_second": 4.42, "step": 4000 }, { "epoch": 3.583556747095621, "grad_norm": 0.010818341746926308, "learning_rate": 2.0822162645218946e-05, "loss": 0.0038, "step": 4010 }, { "epoch": 3.592493297587131, "grad_norm": 0.003599151037633419, "learning_rate": 2.037533512064343e-05, "loss": 0.0065, "step": 4020 }, { "epoch": 3.6014298480786415, "grad_norm": 4.198567867279053, "learning_rate": 1.992850759606792e-05, "loss": 0.0083, "step": 4030 }, { "epoch": 3.610366398570152, "grad_norm": 0.013494855724275112, "learning_rate": 1.9481680071492405e-05, "loss": 0.0045, "step": 4040 }, { "epoch": 3.6193029490616624, "grad_norm": 0.0036234534345567226, "learning_rate": 1.903485254691689e-05, "loss": 0.0016, "step": 4050 }, { "epoch": 3.6282394995531724, "grad_norm": 0.021920403465628624, "learning_rate": 1.8588025022341376e-05, "loss": 0.0012, "step": 4060 }, { "epoch": 3.637176050044683, "grad_norm": 0.004384295083582401, "learning_rate": 1.8141197497765864e-05, "loss": 0.001, "step": 4070 }, { "epoch": 3.646112600536193, "grad_norm": 0.03161391615867615, "learning_rate": 1.769436997319035e-05, "loss": 0.0026, "step": 4080 }, { "epoch": 3.6550491510277032, "grad_norm": 0.0033394452184438705, "learning_rate": 1.7247542448614838e-05, "loss": 0.0267, "step": 4090 }, { "epoch": 3.6639857015192137, "grad_norm": 0.01090541947633028, "learning_rate": 1.680071492403932e-05, "loss": 0.0014, "step": 4100 }, { "epoch": 3.672922252010724, "grad_norm": 0.0053653959184885025, "learning_rate": 1.6353887399463808e-05, "loss": 0.0095, "step": 4110 }, { "epoch": 3.681858802502234, "grad_norm": 0.032379720360040665, "learning_rate": 1.5907059874888293e-05, "loss": 0.0011, "step": 4120 }, { "epoch": 3.6907953529937445, "grad_norm": 0.05944305285811424, "learning_rate": 1.5460232350312782e-05, "loss": 0.0015, "step": 4130 }, { "epoch": 3.6997319034852545, "grad_norm": 0.0054045203141868114, "learning_rate": 1.5013404825737265e-05, "loss": 0.0012, "step": 4140 }, { "epoch": 3.708668453976765, "grad_norm": 0.003021675394847989, "learning_rate": 1.4566577301161752e-05, "loss": 0.0023, "step": 4150 }, { "epoch": 3.7176050044682754, "grad_norm": 0.007955508306622505, "learning_rate": 1.4119749776586239e-05, "loss": 0.0031, "step": 4160 }, { "epoch": 3.726541554959786, "grad_norm": 0.005485454574227333, "learning_rate": 1.3672922252010726e-05, "loss": 0.0014, "step": 4170 }, { "epoch": 3.7354781054512958, "grad_norm": 0.007910342887043953, "learning_rate": 1.322609472743521e-05, "loss": 0.0014, "step": 4180 }, { "epoch": 3.744414655942806, "grad_norm": 0.011793126352131367, "learning_rate": 1.2779267202859696e-05, "loss": 0.001, "step": 4190 }, { "epoch": 3.753351206434316, "grad_norm": 0.005442539695650339, "learning_rate": 1.2332439678284183e-05, "loss": 0.0015, "step": 4200 }, { "epoch": 3.7622877569258266, "grad_norm": 1.1986395120620728, "learning_rate": 1.188561215370867e-05, "loss": 0.002, "step": 4210 }, { "epoch": 3.771224307417337, "grad_norm": 0.006608502473682165, "learning_rate": 1.1438784629133155e-05, "loss": 0.0009, "step": 4220 }, { "epoch": 3.780160857908847, "grad_norm": 0.0039040117990225554, "learning_rate": 1.0991957104557642e-05, "loss": 0.0013, "step": 4230 }, { "epoch": 3.7890974084003575, "grad_norm": 0.0041880221106112, "learning_rate": 1.0545129579982127e-05, "loss": 0.0012, "step": 4240 }, { "epoch": 3.798033958891868, "grad_norm": 0.003776776837185025, "learning_rate": 1.0098302055406614e-05, "loss": 0.0013, "step": 4250 }, { "epoch": 3.806970509383378, "grad_norm": 0.2970888614654541, "learning_rate": 9.651474530831099e-06, "loss": 0.0014, "step": 4260 }, { "epoch": 3.8159070598748883, "grad_norm": 0.003879937343299389, "learning_rate": 9.204647006255586e-06, "loss": 0.0013, "step": 4270 }, { "epoch": 3.8248436103663987, "grad_norm": 3.5169312953948975, "learning_rate": 8.757819481680071e-06, "loss": 0.0035, "step": 4280 }, { "epoch": 3.8337801608579087, "grad_norm": 0.004920534789562225, "learning_rate": 8.310991957104558e-06, "loss": 0.001, "step": 4290 }, { "epoch": 3.842716711349419, "grad_norm": 0.0035059740766882896, "learning_rate": 7.864164432529045e-06, "loss": 0.0094, "step": 4300 }, { "epoch": 3.851653261840929, "grad_norm": 0.004144645761698484, "learning_rate": 7.41733690795353e-06, "loss": 0.001, "step": 4310 }, { "epoch": 3.8605898123324396, "grad_norm": 0.006385812535881996, "learning_rate": 6.970509383378017e-06, "loss": 0.001, "step": 4320 }, { "epoch": 3.86952636282395, "grad_norm": 0.003677819389849901, "learning_rate": 6.523681858802503e-06, "loss": 0.0009, "step": 4330 }, { "epoch": 3.8784629133154604, "grad_norm": 0.003563833888620138, "learning_rate": 6.076854334226989e-06, "loss": 0.0251, "step": 4340 }, { "epoch": 3.8873994638069704, "grad_norm": 0.012182756327092648, "learning_rate": 5.630026809651475e-06, "loss": 0.0422, "step": 4350 }, { "epoch": 3.896336014298481, "grad_norm": 0.004781792871654034, "learning_rate": 5.1831992850759615e-06, "loss": 0.0021, "step": 4360 }, { "epoch": 3.905272564789991, "grad_norm": 0.003455075901001692, "learning_rate": 4.7363717605004475e-06, "loss": 0.0182, "step": 4370 }, { "epoch": 3.9142091152815013, "grad_norm": 0.00627366965636611, "learning_rate": 4.2895442359249335e-06, "loss": 0.0148, "step": 4380 }, { "epoch": 3.9231456657730117, "grad_norm": 0.015941530466079712, "learning_rate": 3.8427167113494195e-06, "loss": 0.0015, "step": 4390 }, { "epoch": 3.932082216264522, "grad_norm": 0.004724125377833843, "learning_rate": 3.3958891867739055e-06, "loss": 0.0093, "step": 4400 }, { "epoch": 3.941018766756032, "grad_norm": 0.0062377783469855785, "learning_rate": 2.9490616621983915e-06, "loss": 0.0011, "step": 4410 }, { "epoch": 3.9499553172475426, "grad_norm": 0.0042613474652171135, "learning_rate": 2.502234137622878e-06, "loss": 0.0141, "step": 4420 }, { "epoch": 3.9588918677390526, "grad_norm": 0.005040575284510851, "learning_rate": 2.055406613047364e-06, "loss": 0.0061, "step": 4430 }, { "epoch": 3.967828418230563, "grad_norm": 0.004678263328969479, "learning_rate": 1.60857908847185e-06, "loss": 0.0011, "step": 4440 }, { "epoch": 3.9767649687220734, "grad_norm": 0.0033705062232911587, "learning_rate": 1.161751563896336e-06, "loss": 0.0065, "step": 4450 }, { "epoch": 3.9857015192135834, "grad_norm": 0.004543005023151636, "learning_rate": 7.149240393208222e-07, "loss": 0.0011, "step": 4460 }, { "epoch": 3.994638069705094, "grad_norm": 0.01603855937719345, "learning_rate": 2.6809651474530835e-07, "loss": 0.001, "step": 4470 }, { "epoch": 4.0, "step": 4476, "total_flos": 5.549295064059888e+18, "train_loss": 0.1165861947240576, "train_runtime": 2488.5837, "train_samples_per_second": 28.775, "train_steps_per_second": 1.799 } ], "logging_steps": 10, "max_steps": 4476, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.549295064059888e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }