{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9986799924570997, "eval_steps": 500, "global_step": 331, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003017160098057703, "grad_norm": 20.078946749628592, "learning_rate": 0.0, "loss": 1.4468, "step": 1 }, { "epoch": 0.006034320196115406, "grad_norm": 4.878738050586557, "learning_rate": 5e-06, "loss": 1.1585, "step": 2 }, { "epoch": 0.00905148029417311, "grad_norm": 5.517865422306177, "learning_rate": 7.924812503605782e-06, "loss": 1.1812, "step": 3 }, { "epoch": 0.012068640392230812, "grad_norm": 2.1032343536449445, "learning_rate": 1e-05, "loss": 1.1021, "step": 4 }, { "epoch": 0.015085800490288516, "grad_norm": 2.592635550230835, "learning_rate": 1e-05, "loss": 1.1066, "step": 5 }, { "epoch": 0.01810296058834622, "grad_norm": 8.062413826195554, "learning_rate": 1e-05, "loss": 1.0541, "step": 6 }, { "epoch": 0.021120120686403922, "grad_norm": 470.52936338944204, "learning_rate": 1e-05, "loss": 1.1385, "step": 7 }, { "epoch": 0.024137280784461625, "grad_norm": 5.56198780429065, "learning_rate": 1e-05, "loss": 1.141, "step": 8 }, { "epoch": 0.02715444088251933, "grad_norm": 3.1646497668353124, "learning_rate": 1e-05, "loss": 1.0944, "step": 9 }, { "epoch": 0.030171600980577033, "grad_norm": 2.0038474699480413, "learning_rate": 1e-05, "loss": 1.0951, "step": 10 }, { "epoch": 0.033188761078634735, "grad_norm": 1.4888187403163304, "learning_rate": 1e-05, "loss": 1.0766, "step": 11 }, { "epoch": 0.03620592117669244, "grad_norm": 1.7152815980179157, "learning_rate": 1e-05, "loss": 1.0706, "step": 12 }, { "epoch": 0.03922308127475014, "grad_norm": 1.4300762542819367, "learning_rate": 1e-05, "loss": 1.0547, "step": 13 }, { "epoch": 0.042240241372807845, "grad_norm": 1.4526317270279436, "learning_rate": 1e-05, "loss": 1.0137, "step": 14 }, { "epoch": 0.04525740147086555, "grad_norm": 1.2974541466019103, "learning_rate": 1e-05, "loss": 1.0501, "step": 15 }, { "epoch": 0.04827456156892325, "grad_norm": 1.2017850759065756, "learning_rate": 1e-05, "loss": 1.0339, "step": 16 }, { "epoch": 0.051291721666980955, "grad_norm": 1.1650312606754425, "learning_rate": 1e-05, "loss": 1.0284, "step": 17 }, { "epoch": 0.05430888176503866, "grad_norm": 1.218417579737997, "learning_rate": 1e-05, "loss": 1.0101, "step": 18 }, { "epoch": 0.05732604186309636, "grad_norm": 1.1592394623790696, "learning_rate": 1e-05, "loss": 1.006, "step": 19 }, { "epoch": 0.060343201961154065, "grad_norm": 1.2412789400669013, "learning_rate": 1e-05, "loss": 1.0081, "step": 20 }, { "epoch": 0.06336036205921176, "grad_norm": 1.1656784199257328, "learning_rate": 1e-05, "loss": 1.0274, "step": 21 }, { "epoch": 0.06637752215726947, "grad_norm": 1.156112593546092, "learning_rate": 1e-05, "loss": 1.0349, "step": 22 }, { "epoch": 0.06939468225532718, "grad_norm": 1.1792204176695948, "learning_rate": 1e-05, "loss": 0.9633, "step": 23 }, { "epoch": 0.07241184235338488, "grad_norm": 1.1670774133822364, "learning_rate": 1e-05, "loss": 0.9912, "step": 24 }, { "epoch": 0.07542900245144257, "grad_norm": 1.1275491972747138, "learning_rate": 1e-05, "loss": 0.9776, "step": 25 }, { "epoch": 0.07844616254950028, "grad_norm": 1.1802067276415016, "learning_rate": 1e-05, "loss": 1.0141, "step": 26 }, { "epoch": 0.08146332264755798, "grad_norm": 1.228045833889261, "learning_rate": 1e-05, "loss": 0.9936, "step": 27 }, { "epoch": 0.08448048274561569, "grad_norm": 1.2484185459639683, "learning_rate": 1e-05, "loss": 0.9932, "step": 28 }, { "epoch": 0.0874976428436734, "grad_norm": 1.2339564712822966, "learning_rate": 1e-05, "loss": 0.95, "step": 29 }, { "epoch": 0.0905148029417311, "grad_norm": 1.211784252377512, "learning_rate": 1e-05, "loss": 0.9711, "step": 30 }, { "epoch": 0.09353196303978879, "grad_norm": 1.1575974315931057, "learning_rate": 1e-05, "loss": 0.9771, "step": 31 }, { "epoch": 0.0965491231378465, "grad_norm": 1.2117409563580097, "learning_rate": 1e-05, "loss": 0.9495, "step": 32 }, { "epoch": 0.0995662832359042, "grad_norm": 1.109114323976095, "learning_rate": 1e-05, "loss": 0.9782, "step": 33 }, { "epoch": 0.10258344333396191, "grad_norm": 1.0945987254678533, "learning_rate": 1e-05, "loss": 0.9621, "step": 34 }, { "epoch": 0.10560060343201962, "grad_norm": 1.1441153052598225, "learning_rate": 1e-05, "loss": 0.976, "step": 35 }, { "epoch": 0.10861776353007732, "grad_norm": 1.0701754107374204, "learning_rate": 1e-05, "loss": 0.9364, "step": 36 }, { "epoch": 0.11163492362813501, "grad_norm": 1.0982624299638388, "learning_rate": 1e-05, "loss": 0.9873, "step": 37 }, { "epoch": 0.11465208372619272, "grad_norm": 1.1658342852048704, "learning_rate": 1e-05, "loss": 0.9727, "step": 38 }, { "epoch": 0.11766924382425042, "grad_norm": 1.140425640145812, "learning_rate": 1e-05, "loss": 0.962, "step": 39 }, { "epoch": 0.12068640392230813, "grad_norm": 1.2113716827018828, "learning_rate": 1e-05, "loss": 0.9794, "step": 40 }, { "epoch": 0.12370356402036584, "grad_norm": 1.0811873110464318, "learning_rate": 1e-05, "loss": 0.9614, "step": 41 }, { "epoch": 0.12672072411842353, "grad_norm": 1.1319113211223166, "learning_rate": 1e-05, "loss": 0.9588, "step": 42 }, { "epoch": 0.12973788421648125, "grad_norm": 1.0450336690045927, "learning_rate": 1e-05, "loss": 0.951, "step": 43 }, { "epoch": 0.13275504431453894, "grad_norm": 1.0888061493358059, "learning_rate": 1e-05, "loss": 0.9631, "step": 44 }, { "epoch": 0.13577220441259663, "grad_norm": 1.092452045324181, "learning_rate": 1e-05, "loss": 0.9648, "step": 45 }, { "epoch": 0.13878936451065435, "grad_norm": 0.9926260475544896, "learning_rate": 1e-05, "loss": 0.9274, "step": 46 }, { "epoch": 0.14180652460871204, "grad_norm": 1.1102540887315238, "learning_rate": 1e-05, "loss": 0.919, "step": 47 }, { "epoch": 0.14482368470676976, "grad_norm": 1.0120307941022935, "learning_rate": 1e-05, "loss": 0.9526, "step": 48 }, { "epoch": 0.14784084480482745, "grad_norm": 1.0962499453405279, "learning_rate": 1e-05, "loss": 0.9078, "step": 49 }, { "epoch": 0.15085800490288515, "grad_norm": 1.1837778452202325, "learning_rate": 1e-05, "loss": 0.9376, "step": 50 }, { "epoch": 0.15387516500094287, "grad_norm": 1.064164454110782, "learning_rate": 1e-05, "loss": 0.9128, "step": 51 }, { "epoch": 0.15689232509900056, "grad_norm": 1.1508638128558335, "learning_rate": 1e-05, "loss": 0.9533, "step": 52 }, { "epoch": 0.15990948519705828, "grad_norm": 1.1609288313516257, "learning_rate": 1e-05, "loss": 0.9551, "step": 53 }, { "epoch": 0.16292664529511597, "grad_norm": 1.0596156545796473, "learning_rate": 1e-05, "loss": 0.9083, "step": 54 }, { "epoch": 0.1659438053931737, "grad_norm": 1.0399099372944773, "learning_rate": 1e-05, "loss": 0.95, "step": 55 }, { "epoch": 0.16896096549123138, "grad_norm": 1.0914115337494106, "learning_rate": 1e-05, "loss": 0.9159, "step": 56 }, { "epoch": 0.17197812558928907, "grad_norm": 1.1208151691589456, "learning_rate": 1e-05, "loss": 0.9404, "step": 57 }, { "epoch": 0.1749952856873468, "grad_norm": 1.1380068007034254, "learning_rate": 1e-05, "loss": 0.9248, "step": 58 }, { "epoch": 0.17801244578540448, "grad_norm": 1.1725768888546018, "learning_rate": 1e-05, "loss": 0.9013, "step": 59 }, { "epoch": 0.1810296058834622, "grad_norm": 1.0230578555799743, "learning_rate": 1e-05, "loss": 0.9184, "step": 60 }, { "epoch": 0.1840467659815199, "grad_norm": 1.1965830709717151, "learning_rate": 1e-05, "loss": 0.9636, "step": 61 }, { "epoch": 0.18706392607957759, "grad_norm": 1.0854955199511183, "learning_rate": 1e-05, "loss": 0.9149, "step": 62 }, { "epoch": 0.1900810861776353, "grad_norm": 1.0776069945953626, "learning_rate": 1e-05, "loss": 0.895, "step": 63 }, { "epoch": 0.193098246275693, "grad_norm": 1.0925276020001646, "learning_rate": 1e-05, "loss": 0.9017, "step": 64 }, { "epoch": 0.19611540637375072, "grad_norm": 1.051729020151003, "learning_rate": 1e-05, "loss": 0.9168, "step": 65 }, { "epoch": 0.1991325664718084, "grad_norm": 1.1276428621962828, "learning_rate": 1e-05, "loss": 0.9359, "step": 66 }, { "epoch": 0.2021497265698661, "grad_norm": 1.039214095313065, "learning_rate": 1e-05, "loss": 0.9458, "step": 67 }, { "epoch": 0.20516688666792382, "grad_norm": 1.108230624792472, "learning_rate": 1e-05, "loss": 0.9012, "step": 68 }, { "epoch": 0.2081840467659815, "grad_norm": 1.0659907986207233, "learning_rate": 1e-05, "loss": 0.932, "step": 69 }, { "epoch": 0.21120120686403923, "grad_norm": 1.1484942212933387, "learning_rate": 1e-05, "loss": 0.9212, "step": 70 }, { "epoch": 0.21421836696209692, "grad_norm": 1.0080008398377343, "learning_rate": 1e-05, "loss": 0.9335, "step": 71 }, { "epoch": 0.21723552706015464, "grad_norm": 1.099906576331005, "learning_rate": 1e-05, "loss": 0.9478, "step": 72 }, { "epoch": 0.22025268715821233, "grad_norm": 1.002361667650721, "learning_rate": 1e-05, "loss": 0.9215, "step": 73 }, { "epoch": 0.22326984725627003, "grad_norm": 1.0106913848906898, "learning_rate": 1e-05, "loss": 0.9103, "step": 74 }, { "epoch": 0.22628700735432775, "grad_norm": 1.031802831727699, "learning_rate": 1e-05, "loss": 0.9591, "step": 75 }, { "epoch": 0.22930416745238544, "grad_norm": 1.091719276309555, "learning_rate": 1e-05, "loss": 0.9431, "step": 76 }, { "epoch": 0.23232132755044316, "grad_norm": 1.0393979811839362, "learning_rate": 1e-05, "loss": 0.8945, "step": 77 }, { "epoch": 0.23533848764850085, "grad_norm": 1.1503832361551256, "learning_rate": 1e-05, "loss": 0.9163, "step": 78 }, { "epoch": 0.23835564774655854, "grad_norm": 1.1141284366563844, "learning_rate": 1e-05, "loss": 0.9093, "step": 79 }, { "epoch": 0.24137280784461626, "grad_norm": 1.0708909622724119, "learning_rate": 1e-05, "loss": 0.9359, "step": 80 }, { "epoch": 0.24438996794267395, "grad_norm": 1.0342956680966762, "learning_rate": 1e-05, "loss": 0.9066, "step": 81 }, { "epoch": 0.24740712804073167, "grad_norm": 1.041231965271414, "learning_rate": 1e-05, "loss": 0.9042, "step": 82 }, { "epoch": 0.25042428813878936, "grad_norm": 1.0891891643003724, "learning_rate": 1e-05, "loss": 0.9157, "step": 83 }, { "epoch": 0.25344144823684706, "grad_norm": 0.9463606441668, "learning_rate": 1e-05, "loss": 0.8652, "step": 84 }, { "epoch": 0.25645860833490475, "grad_norm": 0.9943090136893116, "learning_rate": 1e-05, "loss": 0.9138, "step": 85 }, { "epoch": 0.2594757684329625, "grad_norm": 1.0622250754003486, "learning_rate": 1e-05, "loss": 0.9029, "step": 86 }, { "epoch": 0.2624929285310202, "grad_norm": 1.0190810445181397, "learning_rate": 1e-05, "loss": 0.8701, "step": 87 }, { "epoch": 0.2655100886290779, "grad_norm": 1.0471126343927182, "learning_rate": 1e-05, "loss": 0.9173, "step": 88 }, { "epoch": 0.26852724872713557, "grad_norm": 1.094441431551059, "learning_rate": 1e-05, "loss": 0.9539, "step": 89 }, { "epoch": 0.27154440882519326, "grad_norm": 1.02572224910356, "learning_rate": 1e-05, "loss": 0.9036, "step": 90 }, { "epoch": 0.274561568923251, "grad_norm": 1.0647463602381582, "learning_rate": 1e-05, "loss": 0.8895, "step": 91 }, { "epoch": 0.2775787290213087, "grad_norm": 1.0867418650830927, "learning_rate": 1e-05, "loss": 0.8903, "step": 92 }, { "epoch": 0.2805958891193664, "grad_norm": 1.0448439572661483, "learning_rate": 1e-05, "loss": 0.9013, "step": 93 }, { "epoch": 0.2836130492174241, "grad_norm": 1.0782737190600102, "learning_rate": 1e-05, "loss": 0.9088, "step": 94 }, { "epoch": 0.2866302093154818, "grad_norm": 1.0853475076773287, "learning_rate": 1e-05, "loss": 0.876, "step": 95 }, { "epoch": 0.2896473694135395, "grad_norm": 1.0978686199970553, "learning_rate": 1e-05, "loss": 0.9286, "step": 96 }, { "epoch": 0.2926645295115972, "grad_norm": 1.0673815458666747, "learning_rate": 1e-05, "loss": 0.8973, "step": 97 }, { "epoch": 0.2956816896096549, "grad_norm": 1.184691172374345, "learning_rate": 1e-05, "loss": 0.8944, "step": 98 }, { "epoch": 0.2986988497077126, "grad_norm": 0.9861386386631925, "learning_rate": 1e-05, "loss": 0.9048, "step": 99 }, { "epoch": 0.3017160098057703, "grad_norm": 1.1329936556607971, "learning_rate": 1e-05, "loss": 0.9401, "step": 100 }, { "epoch": 0.30473316990382804, "grad_norm": 0.9543730308331345, "learning_rate": 1e-05, "loss": 0.9149, "step": 101 }, { "epoch": 0.30775033000188573, "grad_norm": 1.1462193660108113, "learning_rate": 1e-05, "loss": 0.9071, "step": 102 }, { "epoch": 0.3107674900999434, "grad_norm": 1.0904581574091061, "learning_rate": 1e-05, "loss": 0.8971, "step": 103 }, { "epoch": 0.3137846501980011, "grad_norm": 1.0495767709008346, "learning_rate": 1e-05, "loss": 0.919, "step": 104 }, { "epoch": 0.31680181029605886, "grad_norm": 0.9640542155531093, "learning_rate": 1e-05, "loss": 0.8713, "step": 105 }, { "epoch": 0.31981897039411655, "grad_norm": 0.9610132463708185, "learning_rate": 1e-05, "loss": 0.8933, "step": 106 }, { "epoch": 0.32283613049217424, "grad_norm": 1.0843824473667214, "learning_rate": 1e-05, "loss": 0.9093, "step": 107 }, { "epoch": 0.32585329059023194, "grad_norm": 0.9945425158672618, "learning_rate": 1e-05, "loss": 0.9071, "step": 108 }, { "epoch": 0.32887045068828963, "grad_norm": 1.0765546524524479, "learning_rate": 1e-05, "loss": 0.865, "step": 109 }, { "epoch": 0.3318876107863474, "grad_norm": 1.01041125101344, "learning_rate": 1e-05, "loss": 0.8907, "step": 110 }, { "epoch": 0.33490477088440507, "grad_norm": 1.0874945222389107, "learning_rate": 1e-05, "loss": 0.894, "step": 111 }, { "epoch": 0.33792193098246276, "grad_norm": 1.0493603540512813, "learning_rate": 1e-05, "loss": 0.9138, "step": 112 }, { "epoch": 0.34093909108052045, "grad_norm": 1.0124123297722831, "learning_rate": 1e-05, "loss": 0.9149, "step": 113 }, { "epoch": 0.34395625117857814, "grad_norm": 1.0531088139319633, "learning_rate": 1e-05, "loss": 0.8856, "step": 114 }, { "epoch": 0.3469734112766359, "grad_norm": 0.9763912034026013, "learning_rate": 1e-05, "loss": 0.905, "step": 115 }, { "epoch": 0.3499905713746936, "grad_norm": 1.0488345908663115, "learning_rate": 1e-05, "loss": 0.8907, "step": 116 }, { "epoch": 0.3530077314727513, "grad_norm": 1.0142709732139472, "learning_rate": 1e-05, "loss": 0.8983, "step": 117 }, { "epoch": 0.35602489157080897, "grad_norm": 0.9983943961658387, "learning_rate": 1e-05, "loss": 0.8881, "step": 118 }, { "epoch": 0.35904205166886666, "grad_norm": 1.0281018778884328, "learning_rate": 1e-05, "loss": 0.8989, "step": 119 }, { "epoch": 0.3620592117669244, "grad_norm": 1.0050409064254446, "learning_rate": 1e-05, "loss": 0.9113, "step": 120 }, { "epoch": 0.3650763718649821, "grad_norm": 1.0317911761320575, "learning_rate": 1e-05, "loss": 0.8926, "step": 121 }, { "epoch": 0.3680935319630398, "grad_norm": 1.0867071456016715, "learning_rate": 1e-05, "loss": 0.8698, "step": 122 }, { "epoch": 0.3711106920610975, "grad_norm": 1.0792050089286005, "learning_rate": 1e-05, "loss": 0.8385, "step": 123 }, { "epoch": 0.37412785215915517, "grad_norm": 1.136911850598409, "learning_rate": 1e-05, "loss": 0.884, "step": 124 }, { "epoch": 0.3771450122572129, "grad_norm": 0.9468447682708495, "learning_rate": 1e-05, "loss": 0.9048, "step": 125 }, { "epoch": 0.3801621723552706, "grad_norm": 0.99640914665336, "learning_rate": 1e-05, "loss": 0.9092, "step": 126 }, { "epoch": 0.3831793324533283, "grad_norm": 1.0035906051414294, "learning_rate": 1e-05, "loss": 0.8612, "step": 127 }, { "epoch": 0.386196492551386, "grad_norm": 1.0443720188110175, "learning_rate": 1e-05, "loss": 0.898, "step": 128 }, { "epoch": 0.3892136526494437, "grad_norm": 1.0019284346638992, "learning_rate": 1e-05, "loss": 0.9194, "step": 129 }, { "epoch": 0.39223081274750143, "grad_norm": 0.9947835482545767, "learning_rate": 1e-05, "loss": 0.8823, "step": 130 }, { "epoch": 0.3952479728455591, "grad_norm": 1.117124665050851, "learning_rate": 1e-05, "loss": 0.8698, "step": 131 }, { "epoch": 0.3982651329436168, "grad_norm": 1.0902265836998275, "learning_rate": 1e-05, "loss": 0.9015, "step": 132 }, { "epoch": 0.4012822930416745, "grad_norm": 0.9890942371356929, "learning_rate": 1e-05, "loss": 0.8702, "step": 133 }, { "epoch": 0.4042994531397322, "grad_norm": 1.079316692788872, "learning_rate": 1e-05, "loss": 0.9274, "step": 134 }, { "epoch": 0.40731661323778995, "grad_norm": 1.0507683546070807, "learning_rate": 1e-05, "loss": 0.8548, "step": 135 }, { "epoch": 0.41033377333584764, "grad_norm": 1.1173910988640408, "learning_rate": 1e-05, "loss": 0.8663, "step": 136 }, { "epoch": 0.41335093343390533, "grad_norm": 1.073210708775559, "learning_rate": 1e-05, "loss": 0.8612, "step": 137 }, { "epoch": 0.416368093531963, "grad_norm": 1.0396491185269086, "learning_rate": 1e-05, "loss": 0.8994, "step": 138 }, { "epoch": 0.4193852536300207, "grad_norm": 1.1500475629157627, "learning_rate": 1e-05, "loss": 0.8525, "step": 139 }, { "epoch": 0.42240241372807846, "grad_norm": 1.2168830597788158, "learning_rate": 1e-05, "loss": 0.9374, "step": 140 }, { "epoch": 0.42541957382613615, "grad_norm": 0.9553833702288597, "learning_rate": 1e-05, "loss": 0.9103, "step": 141 }, { "epoch": 0.42843673392419385, "grad_norm": 0.9850371340573859, "learning_rate": 1e-05, "loss": 0.8806, "step": 142 }, { "epoch": 0.43145389402225154, "grad_norm": 1.1473904156348889, "learning_rate": 1e-05, "loss": 0.8773, "step": 143 }, { "epoch": 0.4344710541203093, "grad_norm": 1.160071820812732, "learning_rate": 1e-05, "loss": 0.8672, "step": 144 }, { "epoch": 0.437488214218367, "grad_norm": 1.0914396615807336, "learning_rate": 1e-05, "loss": 0.8916, "step": 145 }, { "epoch": 0.44050537431642467, "grad_norm": 0.9961823995771022, "learning_rate": 1e-05, "loss": 0.8801, "step": 146 }, { "epoch": 0.44352253441448236, "grad_norm": 1.1627229625741373, "learning_rate": 1e-05, "loss": 0.8486, "step": 147 }, { "epoch": 0.44653969451254005, "grad_norm": 1.088993191172265, "learning_rate": 1e-05, "loss": 0.8541, "step": 148 }, { "epoch": 0.4495568546105978, "grad_norm": 0.9925318555430557, "learning_rate": 1e-05, "loss": 0.8823, "step": 149 }, { "epoch": 0.4525740147086555, "grad_norm": 1.080125550684176, "learning_rate": 1e-05, "loss": 0.8916, "step": 150 }, { "epoch": 0.4555911748067132, "grad_norm": 1.0335542992312043, "learning_rate": 1e-05, "loss": 0.8523, "step": 151 }, { "epoch": 0.4586083349047709, "grad_norm": 1.1905484694931652, "learning_rate": 1e-05, "loss": 0.8861, "step": 152 }, { "epoch": 0.46162549500282857, "grad_norm": 1.0739628080299815, "learning_rate": 1e-05, "loss": 0.8335, "step": 153 }, { "epoch": 0.4646426551008863, "grad_norm": 1.0040085132370382, "learning_rate": 1e-05, "loss": 0.867, "step": 154 }, { "epoch": 0.467659815198944, "grad_norm": 1.056602846699296, "learning_rate": 1e-05, "loss": 0.8579, "step": 155 }, { "epoch": 0.4706769752970017, "grad_norm": 0.9891380470884372, "learning_rate": 1e-05, "loss": 0.8454, "step": 156 }, { "epoch": 0.4736941353950594, "grad_norm": 0.9790395676599171, "learning_rate": 1e-05, "loss": 0.896, "step": 157 }, { "epoch": 0.4767112954931171, "grad_norm": 1.027120791419062, "learning_rate": 1e-05, "loss": 0.924, "step": 158 }, { "epoch": 0.47972845559117483, "grad_norm": 1.0959500600764212, "learning_rate": 1e-05, "loss": 0.8666, "step": 159 }, { "epoch": 0.4827456156892325, "grad_norm": 1.0401653704511453, "learning_rate": 1e-05, "loss": 0.8685, "step": 160 }, { "epoch": 0.4857627757872902, "grad_norm": 1.0428336555982345, "learning_rate": 1e-05, "loss": 0.8739, "step": 161 }, { "epoch": 0.4887799358853479, "grad_norm": 1.0035589157775742, "learning_rate": 1e-05, "loss": 0.9231, "step": 162 }, { "epoch": 0.4917970959834056, "grad_norm": 1.1332138472380069, "learning_rate": 1e-05, "loss": 0.8874, "step": 163 }, { "epoch": 0.49481425608146334, "grad_norm": 0.9889223927200703, "learning_rate": 1e-05, "loss": 0.9152, "step": 164 }, { "epoch": 0.49783141617952104, "grad_norm": 0.9903630601730351, "learning_rate": 1e-05, "loss": 0.8619, "step": 165 }, { "epoch": 0.5008485762775787, "grad_norm": 0.9870755137328874, "learning_rate": 1e-05, "loss": 0.8652, "step": 166 }, { "epoch": 0.5038657363756365, "grad_norm": 1.0295502212015513, "learning_rate": 1e-05, "loss": 0.8701, "step": 167 }, { "epoch": 0.5068828964736941, "grad_norm": 0.9980789726799827, "learning_rate": 1e-05, "loss": 0.8948, "step": 168 }, { "epoch": 0.5099000565717519, "grad_norm": 0.9895924927460445, "learning_rate": 1e-05, "loss": 0.8652, "step": 169 }, { "epoch": 0.5129172166698095, "grad_norm": 0.9897691179372796, "learning_rate": 1e-05, "loss": 0.8872, "step": 170 }, { "epoch": 0.5159343767678672, "grad_norm": 1.0976883085536995, "learning_rate": 1e-05, "loss": 0.8663, "step": 171 }, { "epoch": 0.518951536865925, "grad_norm": 0.9855831526257474, "learning_rate": 1e-05, "loss": 0.8379, "step": 172 }, { "epoch": 0.5219686969639826, "grad_norm": 1.007385512988202, "learning_rate": 1e-05, "loss": 0.8806, "step": 173 }, { "epoch": 0.5249858570620404, "grad_norm": 0.9811416651561021, "learning_rate": 1e-05, "loss": 0.8812, "step": 174 }, { "epoch": 0.528003017160098, "grad_norm": 1.008567949000168, "learning_rate": 1e-05, "loss": 0.9088, "step": 175 }, { "epoch": 0.5310201772581558, "grad_norm": 1.0735249094278072, "learning_rate": 1e-05, "loss": 0.8625, "step": 176 }, { "epoch": 0.5340373373562135, "grad_norm": 1.0099080610350626, "learning_rate": 1e-05, "loss": 0.8774, "step": 177 }, { "epoch": 0.5370544974542711, "grad_norm": 0.9869119389250737, "learning_rate": 1e-05, "loss": 0.8602, "step": 178 }, { "epoch": 0.5400716575523289, "grad_norm": 1.0140325571494992, "learning_rate": 1e-05, "loss": 0.8951, "step": 179 }, { "epoch": 0.5430888176503865, "grad_norm": 0.9746000686910684, "learning_rate": 1e-05, "loss": 0.8546, "step": 180 }, { "epoch": 0.5461059777484443, "grad_norm": 0.9912039011517177, "learning_rate": 1e-05, "loss": 0.8699, "step": 181 }, { "epoch": 0.549123137846502, "grad_norm": 0.9866319014883533, "learning_rate": 1e-05, "loss": 0.8708, "step": 182 }, { "epoch": 0.5521402979445597, "grad_norm": 0.9316538866319553, "learning_rate": 1e-05, "loss": 0.8759, "step": 183 }, { "epoch": 0.5551574580426174, "grad_norm": 0.9917975239418116, "learning_rate": 1e-05, "loss": 0.8743, "step": 184 }, { "epoch": 0.558174618140675, "grad_norm": 0.998938334137341, "learning_rate": 1e-05, "loss": 0.8353, "step": 185 }, { "epoch": 0.5611917782387328, "grad_norm": 1.0536268087144185, "learning_rate": 1e-05, "loss": 0.8892, "step": 186 }, { "epoch": 0.5642089383367905, "grad_norm": 1.0702585591377483, "learning_rate": 1e-05, "loss": 0.8459, "step": 187 }, { "epoch": 0.5672260984348482, "grad_norm": 1.0196433385874035, "learning_rate": 1e-05, "loss": 0.92, "step": 188 }, { "epoch": 0.5702432585329059, "grad_norm": 1.065268517886477, "learning_rate": 1e-05, "loss": 0.8497, "step": 189 }, { "epoch": 0.5732604186309636, "grad_norm": 0.9934866856892721, "learning_rate": 1e-05, "loss": 0.8795, "step": 190 }, { "epoch": 0.5762775787290213, "grad_norm": 0.982727769285356, "learning_rate": 1e-05, "loss": 0.9056, "step": 191 }, { "epoch": 0.579294738827079, "grad_norm": 1.0628075311898817, "learning_rate": 1e-05, "loss": 0.8588, "step": 192 }, { "epoch": 0.5823118989251367, "grad_norm": 0.9227204348296366, "learning_rate": 1e-05, "loss": 0.8569, "step": 193 }, { "epoch": 0.5853290590231944, "grad_norm": 0.9899338734692382, "learning_rate": 1e-05, "loss": 0.8857, "step": 194 }, { "epoch": 0.5883462191212521, "grad_norm": 1.0191415487699054, "learning_rate": 1e-05, "loss": 0.8959, "step": 195 }, { "epoch": 0.5913633792193098, "grad_norm": 1.0972049175262022, "learning_rate": 1e-05, "loss": 0.8473, "step": 196 }, { "epoch": 0.5943805393173676, "grad_norm": 0.9801443916922342, "learning_rate": 1e-05, "loss": 0.8882, "step": 197 }, { "epoch": 0.5973976994154252, "grad_norm": 1.05420177137371, "learning_rate": 1e-05, "loss": 0.8617, "step": 198 }, { "epoch": 0.600414859513483, "grad_norm": 0.9763149595538684, "learning_rate": 1e-05, "loss": 0.8317, "step": 199 }, { "epoch": 0.6034320196115406, "grad_norm": 1.0142128497052438, "learning_rate": 1e-05, "loss": 0.8801, "step": 200 }, { "epoch": 0.6064491797095983, "grad_norm": 1.1223250391910486, "learning_rate": 1e-05, "loss": 0.8651, "step": 201 }, { "epoch": 0.6094663398076561, "grad_norm": 0.9445148128595584, "learning_rate": 1e-05, "loss": 0.9247, "step": 202 }, { "epoch": 0.6124834999057137, "grad_norm": 1.0205891212652056, "learning_rate": 1e-05, "loss": 0.8568, "step": 203 }, { "epoch": 0.6155006600037715, "grad_norm": 1.0132942213261507, "learning_rate": 1e-05, "loss": 0.8474, "step": 204 }, { "epoch": 0.6185178201018292, "grad_norm": 1.0654107515701503, "learning_rate": 1e-05, "loss": 0.8436, "step": 205 }, { "epoch": 0.6215349801998868, "grad_norm": 0.9832515183706407, "learning_rate": 1e-05, "loss": 0.8357, "step": 206 }, { "epoch": 0.6245521402979446, "grad_norm": 1.0121196517905966, "learning_rate": 1e-05, "loss": 0.8852, "step": 207 }, { "epoch": 0.6275693003960022, "grad_norm": 1.2152715233993958, "learning_rate": 1e-05, "loss": 0.8215, "step": 208 }, { "epoch": 0.63058646049406, "grad_norm": 1.0114694608145163, "learning_rate": 1e-05, "loss": 0.865, "step": 209 }, { "epoch": 0.6336036205921177, "grad_norm": 1.0466026516397904, "learning_rate": 1e-05, "loss": 0.8566, "step": 210 }, { "epoch": 0.6366207806901754, "grad_norm": 0.9459705410503775, "learning_rate": 1e-05, "loss": 0.8327, "step": 211 }, { "epoch": 0.6396379407882331, "grad_norm": 0.9749006528453033, "learning_rate": 1e-05, "loss": 0.8642, "step": 212 }, { "epoch": 0.6426551008862907, "grad_norm": 0.9590985337067398, "learning_rate": 1e-05, "loss": 0.8447, "step": 213 }, { "epoch": 0.6456722609843485, "grad_norm": 1.097321284543017, "learning_rate": 1e-05, "loss": 0.8272, "step": 214 }, { "epoch": 0.6486894210824062, "grad_norm": 1.0054521678760522, "learning_rate": 1e-05, "loss": 0.8472, "step": 215 }, { "epoch": 0.6517065811804639, "grad_norm": 1.0099516647809557, "learning_rate": 1e-05, "loss": 0.8654, "step": 216 }, { "epoch": 0.6547237412785216, "grad_norm": 0.9567498655316358, "learning_rate": 1e-05, "loss": 0.8566, "step": 217 }, { "epoch": 0.6577409013765793, "grad_norm": 0.9670178838314303, "learning_rate": 1e-05, "loss": 0.8356, "step": 218 }, { "epoch": 0.660758061474637, "grad_norm": 0.9767210115729468, "learning_rate": 1e-05, "loss": 0.9223, "step": 219 }, { "epoch": 0.6637752215726948, "grad_norm": 1.0035943684822244, "learning_rate": 1e-05, "loss": 0.8778, "step": 220 }, { "epoch": 0.6667923816707524, "grad_norm": 0.9836721726138078, "learning_rate": 1e-05, "loss": 0.8601, "step": 221 }, { "epoch": 0.6698095417688101, "grad_norm": 1.0555276758382468, "learning_rate": 1e-05, "loss": 0.8687, "step": 222 }, { "epoch": 0.6728267018668678, "grad_norm": 1.0236557982823795, "learning_rate": 1e-05, "loss": 0.869, "step": 223 }, { "epoch": 0.6758438619649255, "grad_norm": 1.0490641185205225, "learning_rate": 1e-05, "loss": 0.8513, "step": 224 }, { "epoch": 0.6788610220629833, "grad_norm": 1.0112063959996538, "learning_rate": 1e-05, "loss": 0.8671, "step": 225 }, { "epoch": 0.6818781821610409, "grad_norm": 0.9797715955787868, "learning_rate": 1e-05, "loss": 0.8721, "step": 226 }, { "epoch": 0.6848953422590986, "grad_norm": 1.023163793890387, "learning_rate": 1e-05, "loss": 0.8238, "step": 227 }, { "epoch": 0.6879125023571563, "grad_norm": 0.9901311938407963, "learning_rate": 1e-05, "loss": 0.8455, "step": 228 }, { "epoch": 0.690929662455214, "grad_norm": 1.0333937580724437, "learning_rate": 1e-05, "loss": 0.8799, "step": 229 }, { "epoch": 0.6939468225532718, "grad_norm": 0.9781251267252234, "learning_rate": 1e-05, "loss": 0.8558, "step": 230 }, { "epoch": 0.6969639826513294, "grad_norm": 0.9991779679723747, "learning_rate": 1e-05, "loss": 0.8534, "step": 231 }, { "epoch": 0.6999811427493872, "grad_norm": 1.0389629537546219, "learning_rate": 1e-05, "loss": 0.8466, "step": 232 }, { "epoch": 0.7029983028474448, "grad_norm": 1.0204794940839597, "learning_rate": 1e-05, "loss": 0.832, "step": 233 }, { "epoch": 0.7060154629455025, "grad_norm": 1.0124333073703313, "learning_rate": 1e-05, "loss": 0.826, "step": 234 }, { "epoch": 0.7090326230435603, "grad_norm": 0.9469683819775043, "learning_rate": 1e-05, "loss": 0.8746, "step": 235 }, { "epoch": 0.7120497831416179, "grad_norm": 0.9681088394641286, "learning_rate": 1e-05, "loss": 0.8777, "step": 236 }, { "epoch": 0.7150669432396757, "grad_norm": 0.9534971580611808, "learning_rate": 1e-05, "loss": 0.828, "step": 237 }, { "epoch": 0.7180841033377333, "grad_norm": 0.9231164924550516, "learning_rate": 1e-05, "loss": 0.8851, "step": 238 }, { "epoch": 0.7211012634357911, "grad_norm": 0.9564412650063757, "learning_rate": 1e-05, "loss": 0.8407, "step": 239 }, { "epoch": 0.7241184235338488, "grad_norm": 0.9387537660650889, "learning_rate": 1e-05, "loss": 0.8195, "step": 240 }, { "epoch": 0.7271355836319064, "grad_norm": 0.984184826582449, "learning_rate": 1e-05, "loss": 0.8417, "step": 241 }, { "epoch": 0.7301527437299642, "grad_norm": 0.9680344075351693, "learning_rate": 1e-05, "loss": 0.8679, "step": 242 }, { "epoch": 0.7331699038280218, "grad_norm": 0.9508741001956648, "learning_rate": 1e-05, "loss": 0.8182, "step": 243 }, { "epoch": 0.7361870639260796, "grad_norm": 0.9991310599384764, "learning_rate": 1e-05, "loss": 0.8917, "step": 244 }, { "epoch": 0.7392042240241373, "grad_norm": 1.0472945255103463, "learning_rate": 1e-05, "loss": 0.9124, "step": 245 }, { "epoch": 0.742221384122195, "grad_norm": 0.99539173598333, "learning_rate": 1e-05, "loss": 0.8638, "step": 246 }, { "epoch": 0.7452385442202527, "grad_norm": 1.0251801530213278, "learning_rate": 1e-05, "loss": 0.8413, "step": 247 }, { "epoch": 0.7482557043183103, "grad_norm": 1.0408585876445724, "learning_rate": 1e-05, "loss": 0.8381, "step": 248 }, { "epoch": 0.7512728644163681, "grad_norm": 0.9731694388232142, "learning_rate": 1e-05, "loss": 0.855, "step": 249 }, { "epoch": 0.7542900245144258, "grad_norm": 1.0030115847404013, "learning_rate": 1e-05, "loss": 0.8596, "step": 250 }, { "epoch": 0.7573071846124835, "grad_norm": 0.9841974420990679, "learning_rate": 1e-05, "loss": 0.8742, "step": 251 }, { "epoch": 0.7603243447105412, "grad_norm": 1.055927374179432, "learning_rate": 1e-05, "loss": 0.8286, "step": 252 }, { "epoch": 0.7633415048085989, "grad_norm": 1.0037342058458762, "learning_rate": 1e-05, "loss": 0.8624, "step": 253 }, { "epoch": 0.7663586649066566, "grad_norm": 0.9779177922525308, "learning_rate": 1e-05, "loss": 0.8224, "step": 254 }, { "epoch": 0.7693758250047144, "grad_norm": 1.0485152455162672, "learning_rate": 1e-05, "loss": 0.8353, "step": 255 }, { "epoch": 0.772392985102772, "grad_norm": 0.9427366336442022, "learning_rate": 1e-05, "loss": 0.859, "step": 256 }, { "epoch": 0.7754101452008297, "grad_norm": 0.9826256006927524, "learning_rate": 1e-05, "loss": 0.8781, "step": 257 }, { "epoch": 0.7784273052988874, "grad_norm": 1.097163176429486, "learning_rate": 1e-05, "loss": 0.8603, "step": 258 }, { "epoch": 0.7814444653969451, "grad_norm": 1.0755684776060006, "learning_rate": 1e-05, "loss": 0.8803, "step": 259 }, { "epoch": 0.7844616254950029, "grad_norm": 1.060543596503769, "learning_rate": 1e-05, "loss": 0.8749, "step": 260 }, { "epoch": 0.7874787855930605, "grad_norm": 1.0054727401343206, "learning_rate": 1e-05, "loss": 0.8656, "step": 261 }, { "epoch": 0.7904959456911183, "grad_norm": 1.0209085800021502, "learning_rate": 1e-05, "loss": 0.8821, "step": 262 }, { "epoch": 0.7935131057891759, "grad_norm": 0.9243375424290232, "learning_rate": 1e-05, "loss": 0.8622, "step": 263 }, { "epoch": 0.7965302658872336, "grad_norm": 1.0429931807106287, "learning_rate": 1e-05, "loss": 0.8196, "step": 264 }, { "epoch": 0.7995474259852914, "grad_norm": 1.0354133838924724, "learning_rate": 1e-05, "loss": 0.8339, "step": 265 }, { "epoch": 0.802564586083349, "grad_norm": 0.9226609889690002, "learning_rate": 1e-05, "loss": 0.854, "step": 266 }, { "epoch": 0.8055817461814068, "grad_norm": 0.8608785777678977, "learning_rate": 1e-05, "loss": 0.8371, "step": 267 }, { "epoch": 0.8085989062794644, "grad_norm": 1.0583845408082262, "learning_rate": 1e-05, "loss": 0.8158, "step": 268 }, { "epoch": 0.8116160663775221, "grad_norm": 0.9488390352752252, "learning_rate": 1e-05, "loss": 0.8569, "step": 269 }, { "epoch": 0.8146332264755799, "grad_norm": 1.040418044582926, "learning_rate": 1e-05, "loss": 0.847, "step": 270 }, { "epoch": 0.8176503865736375, "grad_norm": 1.1259258576677653, "learning_rate": 1e-05, "loss": 0.8178, "step": 271 }, { "epoch": 0.8206675466716953, "grad_norm": 0.9418385845633765, "learning_rate": 1e-05, "loss": 0.8293, "step": 272 }, { "epoch": 0.8236847067697529, "grad_norm": 0.9498163839438614, "learning_rate": 1e-05, "loss": 0.8472, "step": 273 }, { "epoch": 0.8267018668678107, "grad_norm": 0.9864544845499665, "learning_rate": 1e-05, "loss": 0.8751, "step": 274 }, { "epoch": 0.8297190269658684, "grad_norm": 1.0406919672946373, "learning_rate": 1e-05, "loss": 0.798, "step": 275 }, { "epoch": 0.832736187063926, "grad_norm": 1.0806255336515647, "learning_rate": 1e-05, "loss": 0.8282, "step": 276 }, { "epoch": 0.8357533471619838, "grad_norm": 0.9860936672416695, "learning_rate": 1e-05, "loss": 0.858, "step": 277 }, { "epoch": 0.8387705072600414, "grad_norm": 0.9718313802197633, "learning_rate": 1e-05, "loss": 0.8466, "step": 278 }, { "epoch": 0.8417876673580992, "grad_norm": 1.0544122713831847, "learning_rate": 1e-05, "loss": 0.8744, "step": 279 }, { "epoch": 0.8448048274561569, "grad_norm": 1.081992381916025, "learning_rate": 1e-05, "loss": 0.8743, "step": 280 }, { "epoch": 0.8478219875542146, "grad_norm": 0.9532379328140087, "learning_rate": 1e-05, "loss": 0.857, "step": 281 }, { "epoch": 0.8508391476522723, "grad_norm": 0.9411887622889539, "learning_rate": 1e-05, "loss": 0.8408, "step": 282 }, { "epoch": 0.8538563077503301, "grad_norm": 0.9619536563065685, "learning_rate": 1e-05, "loss": 0.8501, "step": 283 }, { "epoch": 0.8568734678483877, "grad_norm": 0.9802987123695353, "learning_rate": 1e-05, "loss": 0.8235, "step": 284 }, { "epoch": 0.8598906279464454, "grad_norm": 1.0032810101495542, "learning_rate": 1e-05, "loss": 0.8516, "step": 285 }, { "epoch": 0.8629077880445031, "grad_norm": 0.9916945079271281, "learning_rate": 1e-05, "loss": 0.8324, "step": 286 }, { "epoch": 0.8659249481425608, "grad_norm": 0.9901223884800562, "learning_rate": 1e-05, "loss": 0.8408, "step": 287 }, { "epoch": 0.8689421082406186, "grad_norm": 0.9835034534966476, "learning_rate": 1e-05, "loss": 0.8495, "step": 288 }, { "epoch": 0.8719592683386762, "grad_norm": 0.9750801588302662, "learning_rate": 1e-05, "loss": 0.8563, "step": 289 }, { "epoch": 0.874976428436734, "grad_norm": 0.9641976460403898, "learning_rate": 1e-05, "loss": 0.8421, "step": 290 }, { "epoch": 0.8779935885347916, "grad_norm": 0.9501946334304521, "learning_rate": 1e-05, "loss": 0.8654, "step": 291 }, { "epoch": 0.8810107486328493, "grad_norm": 0.9100014151244762, "learning_rate": 1e-05, "loss": 0.8631, "step": 292 }, { "epoch": 0.8840279087309071, "grad_norm": 0.9944179407196427, "learning_rate": 1e-05, "loss": 0.8464, "step": 293 }, { "epoch": 0.8870450688289647, "grad_norm": 0.9853713974171726, "learning_rate": 1e-05, "loss": 0.8605, "step": 294 }, { "epoch": 0.8900622289270225, "grad_norm": 0.9254523406321978, "learning_rate": 1e-05, "loss": 0.8478, "step": 295 }, { "epoch": 0.8930793890250801, "grad_norm": 0.8959456626105031, "learning_rate": 1e-05, "loss": 0.8457, "step": 296 }, { "epoch": 0.8960965491231379, "grad_norm": 1.0037255347844305, "learning_rate": 1e-05, "loss": 0.8576, "step": 297 }, { "epoch": 0.8991137092211956, "grad_norm": 0.9449138138513178, "learning_rate": 1e-05, "loss": 0.822, "step": 298 }, { "epoch": 0.9021308693192532, "grad_norm": 0.9597546807553757, "learning_rate": 1e-05, "loss": 0.7988, "step": 299 }, { "epoch": 0.905148029417311, "grad_norm": 0.9624792569398237, "learning_rate": 1e-05, "loss": 0.8389, "step": 300 }, { "epoch": 0.9081651895153686, "grad_norm": 0.9435277465551941, "learning_rate": 1e-05, "loss": 0.8722, "step": 301 }, { "epoch": 0.9111823496134264, "grad_norm": 0.9681970370411234, "learning_rate": 1e-05, "loss": 0.8183, "step": 302 }, { "epoch": 0.9141995097114841, "grad_norm": 0.9291647991127606, "learning_rate": 1e-05, "loss": 0.8076, "step": 303 }, { "epoch": 0.9172166698095418, "grad_norm": 1.0177809366807526, "learning_rate": 1e-05, "loss": 0.8645, "step": 304 }, { "epoch": 0.9202338299075995, "grad_norm": 0.9855696615573117, "learning_rate": 1e-05, "loss": 0.8546, "step": 305 }, { "epoch": 0.9232509900056571, "grad_norm": 0.9745085927690291, "learning_rate": 1e-05, "loss": 0.8442, "step": 306 }, { "epoch": 0.9262681501037149, "grad_norm": 0.9356489869997006, "learning_rate": 1e-05, "loss": 0.8571, "step": 307 }, { "epoch": 0.9292853102017726, "grad_norm": 1.0346965656565765, "learning_rate": 1e-05, "loss": 0.823, "step": 308 }, { "epoch": 0.9323024702998303, "grad_norm": 1.123418692750864, "learning_rate": 1e-05, "loss": 0.8717, "step": 309 }, { "epoch": 0.935319630397888, "grad_norm": 1.0319249558634143, "learning_rate": 1e-05, "loss": 0.8433, "step": 310 }, { "epoch": 0.9383367904959456, "grad_norm": 0.9462383670551003, "learning_rate": 1e-05, "loss": 0.8607, "step": 311 }, { "epoch": 0.9413539505940034, "grad_norm": 0.9441101670098899, "learning_rate": 1e-05, "loss": 0.8186, "step": 312 }, { "epoch": 0.9443711106920611, "grad_norm": 1.0504882586379691, "learning_rate": 1e-05, "loss": 0.845, "step": 313 }, { "epoch": 0.9473882707901188, "grad_norm": 0.9048369513823565, "learning_rate": 1e-05, "loss": 0.8174, "step": 314 }, { "epoch": 0.9504054308881765, "grad_norm": 0.9019137471005251, "learning_rate": 1e-05, "loss": 0.8556, "step": 315 }, { "epoch": 0.9534225909862342, "grad_norm": 0.8875508030205819, "learning_rate": 1e-05, "loss": 0.861, "step": 316 }, { "epoch": 0.9564397510842919, "grad_norm": 0.9993353590451418, "learning_rate": 1e-05, "loss": 0.8669, "step": 317 }, { "epoch": 0.9594569111823497, "grad_norm": 0.9488161087038608, "learning_rate": 1e-05, "loss": 0.878, "step": 318 }, { "epoch": 0.9624740712804073, "grad_norm": 0.9365808217541111, "learning_rate": 1e-05, "loss": 0.8222, "step": 319 }, { "epoch": 0.965491231378465, "grad_norm": 0.9799964559542612, "learning_rate": 1e-05, "loss": 0.8468, "step": 320 }, { "epoch": 0.9685083914765227, "grad_norm": 1.0661880405150197, "learning_rate": 1e-05, "loss": 0.8381, "step": 321 }, { "epoch": 0.9715255515745804, "grad_norm": 1.0265204942074564, "learning_rate": 1e-05, "loss": 0.8467, "step": 322 }, { "epoch": 0.9745427116726382, "grad_norm": 1.067821792666605, "learning_rate": 1e-05, "loss": 0.8351, "step": 323 }, { "epoch": 0.9775598717706958, "grad_norm": 1.1466057402506813, "learning_rate": 1e-05, "loss": 0.8551, "step": 324 }, { "epoch": 0.9805770318687536, "grad_norm": 0.9899578843967808, "learning_rate": 1e-05, "loss": 0.8248, "step": 325 }, { "epoch": 0.9835941919668112, "grad_norm": 0.8886227742234571, "learning_rate": 1e-05, "loss": 0.8527, "step": 326 }, { "epoch": 0.9866113520648689, "grad_norm": 0.9780662079081274, "learning_rate": 1e-05, "loss": 0.862, "step": 327 }, { "epoch": 0.9896285121629267, "grad_norm": 1.0240427016912055, "learning_rate": 1e-05, "loss": 0.8646, "step": 328 }, { "epoch": 0.9926456722609843, "grad_norm": 0.9797020588373134, "learning_rate": 1e-05, "loss": 0.8764, "step": 329 }, { "epoch": 0.9956628323590421, "grad_norm": 0.9980560765723044, "learning_rate": 1e-05, "loss": 0.8767, "step": 330 }, { "epoch": 0.9986799924570997, "grad_norm": 0.9334505105444044, "learning_rate": 1e-05, "loss": 0.8189, "step": 331 }, { "epoch": 0.9986799924570997, "step": 331, "total_flos": 437602977054720.0, "train_loss": 0.8955676523220143, "train_runtime": 47565.9142, "train_samples_per_second": 0.892, "train_steps_per_second": 0.007 } ], "logging_steps": 1.0, "max_steps": 331, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 437602977054720.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }