diff --git "a/checkpoint-2000/trainer_state.json" "b/checkpoint-2000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2000/trainer_state.json" @@ -0,0 +1,14033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.48297512678097076, + "eval_steps": 500, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002414875633904854, + "grad_norm": 0.48872238397598267, + "learning_rate": 9.638554216867472e-07, + "loss": 2.1188, + "step": 1 + }, + { + "epoch": 0.0004829751267809708, + "grad_norm": 0.4883142113685608, + "learning_rate": 1.9277108433734943e-06, + "loss": 1.943, + "step": 2 + }, + { + "epoch": 0.0007244626901714562, + "grad_norm": 2.160808563232422, + "learning_rate": 2.891566265060241e-06, + "loss": 2.3426, + "step": 3 + }, + { + "epoch": 0.0009659502535619416, + "grad_norm": 0.5656324625015259, + "learning_rate": 3.855421686746989e-06, + "loss": 2.0497, + "step": 4 + }, + { + "epoch": 0.001207437816952427, + "grad_norm": 0.5182572603225708, + "learning_rate": 4.819277108433735e-06, + "loss": 1.9081, + "step": 5 + }, + { + "epoch": 0.0014489253803429123, + "grad_norm": 0.615043044090271, + "learning_rate": 5.783132530120482e-06, + "loss": 2.0246, + "step": 6 + }, + { + "epoch": 0.0016904129437333977, + "grad_norm": 0.47701945900917053, + "learning_rate": 6.746987951807229e-06, + "loss": 1.9923, + "step": 7 + }, + { + "epoch": 0.001931900507123883, + "grad_norm": 0.4645046591758728, + "learning_rate": 7.710843373493977e-06, + "loss": 1.9992, + "step": 8 + }, + { + "epoch": 0.0021733880705143687, + "grad_norm": 0.6710774302482605, + "learning_rate": 8.674698795180724e-06, + "loss": 1.9561, + "step": 9 + }, + { + "epoch": 0.002414875633904854, + "grad_norm": 0.43727195262908936, + "learning_rate": 9.63855421686747e-06, + "loss": 1.9986, + "step": 10 + }, + { + "epoch": 0.0026563631972953395, + "grad_norm": 0.41306453943252563, + "learning_rate": 1.0602409638554219e-05, + "loss": 1.8657, + "step": 11 + }, + { + "epoch": 0.0028978507606858247, + "grad_norm": 0.496465802192688, + "learning_rate": 1.1566265060240964e-05, + "loss": 1.9444, + "step": 12 + }, + { + "epoch": 0.0031393383240763103, + "grad_norm": 0.40364280343055725, + "learning_rate": 1.2530120481927712e-05, + "loss": 2.0184, + "step": 13 + }, + { + "epoch": 0.0033808258874667954, + "grad_norm": 0.4289240539073944, + "learning_rate": 1.3493975903614458e-05, + "loss": 1.9886, + "step": 14 + }, + { + "epoch": 0.003622313450857281, + "grad_norm": 0.3964898884296417, + "learning_rate": 1.4457831325301207e-05, + "loss": 1.8049, + "step": 15 + }, + { + "epoch": 0.003863801014247766, + "grad_norm": 0.39897167682647705, + "learning_rate": 1.5421686746987955e-05, + "loss": 1.9805, + "step": 16 + }, + { + "epoch": 0.004105288577638252, + "grad_norm": 0.4459080696105957, + "learning_rate": 1.63855421686747e-05, + "loss": 1.9905, + "step": 17 + }, + { + "epoch": 0.004346776141028737, + "grad_norm": 0.7771973609924316, + "learning_rate": 1.7349397590361448e-05, + "loss": 2.2652, + "step": 18 + }, + { + "epoch": 0.004588263704419222, + "grad_norm": 0.4256933629512787, + "learning_rate": 1.8313253012048194e-05, + "loss": 2.0247, + "step": 19 + }, + { + "epoch": 0.004829751267809708, + "grad_norm": 0.41948211193084717, + "learning_rate": 1.927710843373494e-05, + "loss": 1.9011, + "step": 20 + }, + { + "epoch": 0.005071238831200193, + "grad_norm": 0.3880179524421692, + "learning_rate": 
2.0240963855421687e-05, + "loss": 1.6799, + "step": 21 + }, + { + "epoch": 0.005312726394590679, + "grad_norm": 0.39275649189949036, + "learning_rate": 2.1204819277108437e-05, + "loss": 1.916, + "step": 22 + }, + { + "epoch": 0.005554213957981164, + "grad_norm": 0.35941553115844727, + "learning_rate": 2.2168674698795184e-05, + "loss": 1.7779, + "step": 23 + }, + { + "epoch": 0.005795701521371649, + "grad_norm": 0.4126398265361786, + "learning_rate": 2.3132530120481927e-05, + "loss": 2.004, + "step": 24 + }, + { + "epoch": 0.006037189084762135, + "grad_norm": 0.3780952990055084, + "learning_rate": 2.409638554216868e-05, + "loss": 1.8459, + "step": 25 + }, + { + "epoch": 0.0062786766481526205, + "grad_norm": 0.3541395366191864, + "learning_rate": 2.5060240963855423e-05, + "loss": 1.7157, + "step": 26 + }, + { + "epoch": 0.006520164211543105, + "grad_norm": 0.4550764858722687, + "learning_rate": 2.602409638554217e-05, + "loss": 1.8738, + "step": 27 + }, + { + "epoch": 0.006761651774933591, + "grad_norm": 0.4110875725746155, + "learning_rate": 2.6987951807228917e-05, + "loss": 1.7607, + "step": 28 + }, + { + "epoch": 0.0070031393383240765, + "grad_norm": 0.398453027009964, + "learning_rate": 2.7951807228915666e-05, + "loss": 1.9628, + "step": 29 + }, + { + "epoch": 0.007244626901714562, + "grad_norm": 0.3572748005390167, + "learning_rate": 2.8915662650602413e-05, + "loss": 1.775, + "step": 30 + }, + { + "epoch": 0.007486114465105047, + "grad_norm": 0.38363558053970337, + "learning_rate": 2.9879518072289156e-05, + "loss": 1.855, + "step": 31 + }, + { + "epoch": 0.007727602028495532, + "grad_norm": 0.392665296792984, + "learning_rate": 3.084337349397591e-05, + "loss": 2.0708, + "step": 32 + }, + { + "epoch": 0.007969089591886018, + "grad_norm": 0.42784029245376587, + "learning_rate": 3.180722891566265e-05, + "loss": 2.0002, + "step": 33 + }, + { + "epoch": 0.008210577155276504, + "grad_norm": 0.39450863003730774, + "learning_rate": 3.27710843373494e-05, + "loss": 1.7978, + "step": 34 + }, + { + "epoch": 0.00845206471866699, + "grad_norm": 0.37916016578674316, + "learning_rate": 3.373493975903615e-05, + "loss": 1.7597, + "step": 35 + }, + { + "epoch": 0.008693552282057475, + "grad_norm": 0.3838157653808594, + "learning_rate": 3.4698795180722896e-05, + "loss": 1.7366, + "step": 36 + }, + { + "epoch": 0.008935039845447959, + "grad_norm": 0.39187654852867126, + "learning_rate": 3.566265060240964e-05, + "loss": 1.7743, + "step": 37 + }, + { + "epoch": 0.009176527408838444, + "grad_norm": 0.4216479957103729, + "learning_rate": 3.662650602409639e-05, + "loss": 1.9526, + "step": 38 + }, + { + "epoch": 0.00941801497222893, + "grad_norm": 0.3791981637477875, + "learning_rate": 3.759036144578314e-05, + "loss": 1.8637, + "step": 39 + }, + { + "epoch": 0.009659502535619416, + "grad_norm": 0.4517281949520111, + "learning_rate": 3.855421686746988e-05, + "loss": 1.9789, + "step": 40 + }, + { + "epoch": 0.009900990099009901, + "grad_norm": 0.3904320001602173, + "learning_rate": 3.9518072289156625e-05, + "loss": 1.9162, + "step": 41 + }, + { + "epoch": 0.010142477662400387, + "grad_norm": 0.39694979786872864, + "learning_rate": 4.0481927710843375e-05, + "loss": 2.0246, + "step": 42 + }, + { + "epoch": 0.010383965225790872, + "grad_norm": 0.39392992854118347, + "learning_rate": 4.1445783132530125e-05, + "loss": 1.8925, + "step": 43 + }, + { + "epoch": 0.010625452789181358, + "grad_norm": 0.3753025233745575, + "learning_rate": 4.2409638554216875e-05, + "loss": 1.777, + "step": 44 + }, + { + "epoch": 
0.010866940352571842, + "grad_norm": 0.35296690464019775, + "learning_rate": 4.337349397590362e-05, + "loss": 1.7254, + "step": 45 + }, + { + "epoch": 0.011108427915962327, + "grad_norm": 0.39575520157814026, + "learning_rate": 4.433734939759037e-05, + "loss": 1.819, + "step": 46 + }, + { + "epoch": 0.011349915479352813, + "grad_norm": 0.415618896484375, + "learning_rate": 4.530120481927712e-05, + "loss": 1.9398, + "step": 47 + }, + { + "epoch": 0.011591403042743299, + "grad_norm": 0.3653118908405304, + "learning_rate": 4.6265060240963854e-05, + "loss": 1.7664, + "step": 48 + }, + { + "epoch": 0.011832890606133784, + "grad_norm": 0.38401493430137634, + "learning_rate": 4.7228915662650604e-05, + "loss": 1.9299, + "step": 49 + }, + { + "epoch": 0.01207437816952427, + "grad_norm": 0.4112469255924225, + "learning_rate": 4.819277108433736e-05, + "loss": 1.9618, + "step": 50 + }, + { + "epoch": 0.012315865732914755, + "grad_norm": 0.39517056941986084, + "learning_rate": 4.91566265060241e-05, + "loss": 2.0678, + "step": 51 + }, + { + "epoch": 0.012557353296305241, + "grad_norm": 0.38852378726005554, + "learning_rate": 5.012048192771085e-05, + "loss": 1.9389, + "step": 52 + }, + { + "epoch": 0.012798840859695725, + "grad_norm": 0.392365425825119, + "learning_rate": 5.108433734939759e-05, + "loss": 1.872, + "step": 53 + }, + { + "epoch": 0.01304032842308621, + "grad_norm": 0.40039297938346863, + "learning_rate": 5.204819277108434e-05, + "loss": 1.9234, + "step": 54 + }, + { + "epoch": 0.013281815986476696, + "grad_norm": 0.37631353735923767, + "learning_rate": 5.301204819277109e-05, + "loss": 1.8483, + "step": 55 + }, + { + "epoch": 0.013523303549867182, + "grad_norm": 0.3847208321094513, + "learning_rate": 5.397590361445783e-05, + "loss": 1.7396, + "step": 56 + }, + { + "epoch": 0.013764791113257667, + "grad_norm": 0.43836677074432373, + "learning_rate": 5.493975903614458e-05, + "loss": 2.1202, + "step": 57 + }, + { + "epoch": 0.014006278676648153, + "grad_norm": 0.4151008427143097, + "learning_rate": 5.590361445783133e-05, + "loss": 1.9056, + "step": 58 + }, + { + "epoch": 0.014247766240038639, + "grad_norm": 0.4057491719722748, + "learning_rate": 5.6867469879518076e-05, + "loss": 1.8731, + "step": 59 + }, + { + "epoch": 0.014489253803429124, + "grad_norm": 0.39896196126937866, + "learning_rate": 5.7831325301204826e-05, + "loss": 1.7901, + "step": 60 + }, + { + "epoch": 0.014730741366819608, + "grad_norm": 0.5027028322219849, + "learning_rate": 5.8795180722891576e-05, + "loss": 2.176, + "step": 61 + }, + { + "epoch": 0.014972228930210094, + "grad_norm": 0.41533949971199036, + "learning_rate": 5.975903614457831e-05, + "loss": 1.8349, + "step": 62 + }, + { + "epoch": 0.01521371649360058, + "grad_norm": 0.41627174615859985, + "learning_rate": 6.072289156626506e-05, + "loss": 1.8164, + "step": 63 + }, + { + "epoch": 0.015455204056991065, + "grad_norm": 0.3680180311203003, + "learning_rate": 6.168674698795182e-05, + "loss": 1.7825, + "step": 64 + }, + { + "epoch": 0.01569669162038155, + "grad_norm": 0.3980069160461426, + "learning_rate": 6.265060240963856e-05, + "loss": 1.8251, + "step": 65 + }, + { + "epoch": 0.015938179183772036, + "grad_norm": 0.3967473804950714, + "learning_rate": 6.36144578313253e-05, + "loss": 1.8168, + "step": 66 + }, + { + "epoch": 0.01617966674716252, + "grad_norm": 0.3991287052631378, + "learning_rate": 6.457831325301206e-05, + "loss": 1.8828, + "step": 67 + }, + { + "epoch": 0.016421154310553007, + "grad_norm": 0.4125327467918396, + "learning_rate": 6.55421686746988e-05, + 
"loss": 1.848, + "step": 68 + }, + { + "epoch": 0.016662641873943493, + "grad_norm": 0.37583857774734497, + "learning_rate": 6.650602409638555e-05, + "loss": 1.7656, + "step": 69 + }, + { + "epoch": 0.01690412943733398, + "grad_norm": 0.43856287002563477, + "learning_rate": 6.74698795180723e-05, + "loss": 1.9077, + "step": 70 + }, + { + "epoch": 0.017145617000724464, + "grad_norm": 0.39317071437835693, + "learning_rate": 6.843373493975903e-05, + "loss": 1.8317, + "step": 71 + }, + { + "epoch": 0.01738710456411495, + "grad_norm": 0.3993190824985504, + "learning_rate": 6.939759036144579e-05, + "loss": 1.8451, + "step": 72 + }, + { + "epoch": 0.017628592127505432, + "grad_norm": 0.3683207333087921, + "learning_rate": 7.036144578313253e-05, + "loss": 1.7778, + "step": 73 + }, + { + "epoch": 0.017870079690895917, + "grad_norm": 0.38704434037208557, + "learning_rate": 7.132530120481928e-05, + "loss": 1.8159, + "step": 74 + }, + { + "epoch": 0.018111567254286403, + "grad_norm": 0.42196622490882874, + "learning_rate": 7.228915662650603e-05, + "loss": 2.1045, + "step": 75 + }, + { + "epoch": 0.01835305481767689, + "grad_norm": 0.3692149817943573, + "learning_rate": 7.325301204819278e-05, + "loss": 1.7807, + "step": 76 + }, + { + "epoch": 0.018594542381067374, + "grad_norm": 0.3880510926246643, + "learning_rate": 7.421686746987952e-05, + "loss": 1.7362, + "step": 77 + }, + { + "epoch": 0.01883602994445786, + "grad_norm": 0.379742830991745, + "learning_rate": 7.518072289156628e-05, + "loss": 1.8806, + "step": 78 + }, + { + "epoch": 0.019077517507848345, + "grad_norm": 0.3501541018486023, + "learning_rate": 7.614457831325302e-05, + "loss": 1.6607, + "step": 79 + }, + { + "epoch": 0.01931900507123883, + "grad_norm": 0.3936968743801117, + "learning_rate": 7.710843373493976e-05, + "loss": 1.9365, + "step": 80 + }, + { + "epoch": 0.019560492634629317, + "grad_norm": 0.3812267780303955, + "learning_rate": 7.807228915662652e-05, + "loss": 1.8093, + "step": 81 + }, + { + "epoch": 0.019801980198019802, + "grad_norm": 0.3729088604450226, + "learning_rate": 7.903614457831325e-05, + "loss": 1.7508, + "step": 82 + }, + { + "epoch": 0.020043467761410288, + "grad_norm": 0.36335960030555725, + "learning_rate": 8e-05, + "loss": 1.7563, + "step": 83 + }, + { + "epoch": 0.020284955324800773, + "grad_norm": 0.3932444155216217, + "learning_rate": 7.999998801313446e-05, + "loss": 1.9381, + "step": 84 + }, + { + "epoch": 0.02052644288819126, + "grad_norm": 0.37464866042137146, + "learning_rate": 7.9999952052545e-05, + "loss": 1.897, + "step": 85 + }, + { + "epoch": 0.020767930451581745, + "grad_norm": 0.5091702938079834, + "learning_rate": 7.99998921182532e-05, + "loss": 2.0178, + "step": 86 + }, + { + "epoch": 0.02100941801497223, + "grad_norm": 0.35622596740722656, + "learning_rate": 7.999980821029496e-05, + "loss": 1.7142, + "step": 87 + }, + { + "epoch": 0.021250905578362716, + "grad_norm": 0.35853254795074463, + "learning_rate": 7.999970032872057e-05, + "loss": 1.727, + "step": 88 + }, + { + "epoch": 0.021492393141753198, + "grad_norm": 0.37769579887390137, + "learning_rate": 7.99995684735947e-05, + "loss": 1.8811, + "step": 89 + }, + { + "epoch": 0.021733880705143684, + "grad_norm": 0.3953562378883362, + "learning_rate": 7.999941264499637e-05, + "loss": 1.8882, + "step": 90 + }, + { + "epoch": 0.02197536826853417, + "grad_norm": 0.3842523992061615, + "learning_rate": 7.999923284301897e-05, + "loss": 1.9009, + "step": 91 + }, + { + "epoch": 0.022216855831924655, + "grad_norm": 0.4005531072616577, + "learning_rate": 
7.999902906777028e-05, + "loss": 2.0613, + "step": 92 + }, + { + "epoch": 0.02245834339531514, + "grad_norm": 0.37064820528030396, + "learning_rate": 7.999880131937242e-05, + "loss": 1.9517, + "step": 93 + }, + { + "epoch": 0.022699830958705626, + "grad_norm": 0.372097373008728, + "learning_rate": 7.999854959796187e-05, + "loss": 1.8402, + "step": 94 + }, + { + "epoch": 0.02294131852209611, + "grad_norm": 0.34422364830970764, + "learning_rate": 7.999827390368954e-05, + "loss": 1.754, + "step": 95 + }, + { + "epoch": 0.023182806085486597, + "grad_norm": 0.4320511817932129, + "learning_rate": 7.999797423672062e-05, + "loss": 1.9835, + "step": 96 + }, + { + "epoch": 0.023424293648877083, + "grad_norm": 0.34041526913642883, + "learning_rate": 7.999765059723475e-05, + "loss": 1.593, + "step": 97 + }, + { + "epoch": 0.02366578121226757, + "grad_norm": 0.3749473989009857, + "learning_rate": 7.999730298542589e-05, + "loss": 1.9249, + "step": 98 + }, + { + "epoch": 0.023907268775658054, + "grad_norm": 0.37020304799079895, + "learning_rate": 7.999693140150238e-05, + "loss": 1.9598, + "step": 99 + }, + { + "epoch": 0.02414875633904854, + "grad_norm": 0.3638790249824524, + "learning_rate": 7.99965358456869e-05, + "loss": 1.7858, + "step": 100 + }, + { + "epoch": 0.024390243902439025, + "grad_norm": 0.35202088952064514, + "learning_rate": 7.999611631821657e-05, + "loss": 1.8988, + "step": 101 + }, + { + "epoch": 0.02463173146582951, + "grad_norm": 0.3286641538143158, + "learning_rate": 7.999567281934278e-05, + "loss": 1.73, + "step": 102 + }, + { + "epoch": 0.024873219029219996, + "grad_norm": 0.3850080668926239, + "learning_rate": 7.99952053493314e-05, + "loss": 1.8341, + "step": 103 + }, + { + "epoch": 0.025114706592610482, + "grad_norm": 0.354960560798645, + "learning_rate": 7.999471390846253e-05, + "loss": 2.0089, + "step": 104 + }, + { + "epoch": 0.025356194156000968, + "grad_norm": 0.3476881682872772, + "learning_rate": 7.999419849703078e-05, + "loss": 1.833, + "step": 105 + }, + { + "epoch": 0.02559768171939145, + "grad_norm": 0.35317471623420715, + "learning_rate": 7.999365911534503e-05, + "loss": 1.8344, + "step": 106 + }, + { + "epoch": 0.025839169282781935, + "grad_norm": 0.3764777183532715, + "learning_rate": 7.999309576372855e-05, + "loss": 1.9944, + "step": 107 + }, + { + "epoch": 0.02608065684617242, + "grad_norm": 0.3360855281352997, + "learning_rate": 7.999250844251898e-05, + "loss": 1.7526, + "step": 108 + }, + { + "epoch": 0.026322144409562907, + "grad_norm": 0.37262898683547974, + "learning_rate": 7.999189715206832e-05, + "loss": 1.7409, + "step": 109 + }, + { + "epoch": 0.026563631972953392, + "grad_norm": 0.34567996859550476, + "learning_rate": 7.999126189274298e-05, + "loss": 1.76, + "step": 110 + }, + { + "epoch": 0.026805119536343878, + "grad_norm": 0.37824591994285583, + "learning_rate": 7.999060266492366e-05, + "loss": 1.9955, + "step": 111 + }, + { + "epoch": 0.027046607099734363, + "grad_norm": 0.3456074297428131, + "learning_rate": 7.998991946900549e-05, + "loss": 1.6786, + "step": 112 + }, + { + "epoch": 0.02728809466312485, + "grad_norm": 0.40303823351860046, + "learning_rate": 7.998921230539792e-05, + "loss": 2.009, + "step": 113 + }, + { + "epoch": 0.027529582226515335, + "grad_norm": 0.37486642599105835, + "learning_rate": 7.998848117452479e-05, + "loss": 2.0262, + "step": 114 + }, + { + "epoch": 0.02777106978990582, + "grad_norm": 0.35351452231407166, + "learning_rate": 7.998772607682431e-05, + "loss": 1.8546, + "step": 115 + }, + { + "epoch": 0.028012557353296306, + 
"grad_norm": 0.33875027298927307, + "learning_rate": 7.998694701274901e-05, + "loss": 1.766, + "step": 116 + }, + { + "epoch": 0.02825404491668679, + "grad_norm": 0.35830602049827576, + "learning_rate": 7.998614398276586e-05, + "loss": 1.6792, + "step": 117 + }, + { + "epoch": 0.028495532480077277, + "grad_norm": 0.33689743280410767, + "learning_rate": 7.998531698735611e-05, + "loss": 1.8919, + "step": 118 + }, + { + "epoch": 0.028737020043467763, + "grad_norm": 0.33229848742485046, + "learning_rate": 7.998446602701544e-05, + "loss": 1.8482, + "step": 119 + }, + { + "epoch": 0.028978507606858248, + "grad_norm": 0.3552752733230591, + "learning_rate": 7.998359110225386e-05, + "loss": 1.8519, + "step": 120 + }, + { + "epoch": 0.029219995170248734, + "grad_norm": 0.3789513111114502, + "learning_rate": 7.998269221359575e-05, + "loss": 1.7455, + "step": 121 + }, + { + "epoch": 0.029461482733639216, + "grad_norm": 0.32534146308898926, + "learning_rate": 7.998176936157986e-05, + "loss": 1.7738, + "step": 122 + }, + { + "epoch": 0.0297029702970297, + "grad_norm": 0.37436211109161377, + "learning_rate": 7.998082254675929e-05, + "loss": 1.8552, + "step": 123 + }, + { + "epoch": 0.029944457860420187, + "grad_norm": 0.3442078232765198, + "learning_rate": 7.99798517697015e-05, + "loss": 1.7527, + "step": 124 + }, + { + "epoch": 0.030185945423810673, + "grad_norm": 0.36838826537132263, + "learning_rate": 7.997885703098833e-05, + "loss": 1.8089, + "step": 125 + }, + { + "epoch": 0.03042743298720116, + "grad_norm": 0.3229195475578308, + "learning_rate": 7.997783833121595e-05, + "loss": 1.7343, + "step": 126 + }, + { + "epoch": 0.030668920550591644, + "grad_norm": 0.35546913743019104, + "learning_rate": 7.997679567099495e-05, + "loss": 1.8091, + "step": 127 + }, + { + "epoch": 0.03091040811398213, + "grad_norm": 0.3430229425430298, + "learning_rate": 7.99757290509502e-05, + "loss": 1.731, + "step": 128 + }, + { + "epoch": 0.031151895677372615, + "grad_norm": 0.34878894686698914, + "learning_rate": 7.997463847172099e-05, + "loss": 1.8177, + "step": 129 + }, + { + "epoch": 0.0313933832407631, + "grad_norm": 0.3356412649154663, + "learning_rate": 7.997352393396094e-05, + "loss": 1.8495, + "step": 130 + }, + { + "epoch": 0.031634870804153586, + "grad_norm": 0.3388964533805847, + "learning_rate": 7.997238543833807e-05, + "loss": 1.7708, + "step": 131 + }, + { + "epoch": 0.03187635836754407, + "grad_norm": 0.3642221689224243, + "learning_rate": 7.99712229855347e-05, + "loss": 1.8336, + "step": 132 + }, + { + "epoch": 0.03211784593093456, + "grad_norm": 0.3364923298358917, + "learning_rate": 7.997003657624755e-05, + "loss": 1.7808, + "step": 133 + }, + { + "epoch": 0.03235933349432504, + "grad_norm": 0.35074931383132935, + "learning_rate": 7.996882621118769e-05, + "loss": 1.8519, + "step": 134 + }, + { + "epoch": 0.03260082105771553, + "grad_norm": 0.3484658896923065, + "learning_rate": 7.996759189108053e-05, + "loss": 1.8158, + "step": 135 + }, + { + "epoch": 0.032842308621106014, + "grad_norm": 0.32097330689430237, + "learning_rate": 7.996633361666587e-05, + "loss": 1.7388, + "step": 136 + }, + { + "epoch": 0.0330837961844965, + "grad_norm": 0.3958728611469269, + "learning_rate": 7.996505138869783e-05, + "loss": 1.9125, + "step": 137 + }, + { + "epoch": 0.033325283747886986, + "grad_norm": 0.3487996757030487, + "learning_rate": 7.996374520794492e-05, + "loss": 1.9042, + "step": 138 + }, + { + "epoch": 0.03356677131127747, + "grad_norm": 0.38680174946784973, + "learning_rate": 7.996241507518998e-05, + "loss": 
1.9944, + "step": 139 + }, + { + "epoch": 0.03380825887466796, + "grad_norm": 0.32666078209877014, + "learning_rate": 7.996106099123022e-05, + "loss": 1.6428, + "step": 140 + }, + { + "epoch": 0.03404974643805844, + "grad_norm": 0.3395536541938782, + "learning_rate": 7.995968295687719e-05, + "loss": 1.8936, + "step": 141 + }, + { + "epoch": 0.03429123400144893, + "grad_norm": 0.3326514661312103, + "learning_rate": 7.995828097295685e-05, + "loss": 1.7893, + "step": 142 + }, + { + "epoch": 0.034532721564839414, + "grad_norm": 0.35848790407180786, + "learning_rate": 7.995685504030941e-05, + "loss": 1.9426, + "step": 143 + }, + { + "epoch": 0.0347742091282299, + "grad_norm": 0.3663111925125122, + "learning_rate": 7.995540515978952e-05, + "loss": 1.8951, + "step": 144 + }, + { + "epoch": 0.035015696691620385, + "grad_norm": 0.33936575055122375, + "learning_rate": 7.995393133226616e-05, + "loss": 1.8215, + "step": 145 + }, + { + "epoch": 0.035257184255010864, + "grad_norm": 0.33017027378082275, + "learning_rate": 7.995243355862266e-05, + "loss": 1.8301, + "step": 146 + }, + { + "epoch": 0.03549867181840135, + "grad_norm": 0.33415642380714417, + "learning_rate": 7.99509118397567e-05, + "loss": 1.8482, + "step": 147 + }, + { + "epoch": 0.035740159381791835, + "grad_norm": 0.35916557908058167, + "learning_rate": 7.99493661765803e-05, + "loss": 1.8992, + "step": 148 + }, + { + "epoch": 0.03598164694518232, + "grad_norm": 0.3150824308395386, + "learning_rate": 7.994779657001984e-05, + "loss": 1.7173, + "step": 149 + }, + { + "epoch": 0.036223134508572806, + "grad_norm": 0.35707587003707886, + "learning_rate": 7.994620302101607e-05, + "loss": 2.0529, + "step": 150 + }, + { + "epoch": 0.03646462207196329, + "grad_norm": 0.30455395579338074, + "learning_rate": 7.994458553052406e-05, + "loss": 1.5871, + "step": 151 + }, + { + "epoch": 0.03670610963535378, + "grad_norm": 0.3313930034637451, + "learning_rate": 7.994294409951326e-05, + "loss": 1.7102, + "step": 152 + }, + { + "epoch": 0.03694759719874426, + "grad_norm": 0.3556051254272461, + "learning_rate": 7.994127872896744e-05, + "loss": 1.9564, + "step": 153 + }, + { + "epoch": 0.03718908476213475, + "grad_norm": 0.39041200280189514, + "learning_rate": 7.993958941988472e-05, + "loss": 2.0505, + "step": 154 + }, + { + "epoch": 0.037430572325525234, + "grad_norm": 0.35395804047584534, + "learning_rate": 7.993787617327758e-05, + "loss": 1.9035, + "step": 155 + }, + { + "epoch": 0.03767205988891572, + "grad_norm": 0.32132115960121155, + "learning_rate": 7.993613899017286e-05, + "loss": 1.8414, + "step": 156 + }, + { + "epoch": 0.037913547452306205, + "grad_norm": 0.32500675320625305, + "learning_rate": 7.99343778716117e-05, + "loss": 1.4969, + "step": 157 + }, + { + "epoch": 0.03815503501569669, + "grad_norm": 0.32838916778564453, + "learning_rate": 7.993259281864964e-05, + "loss": 1.7858, + "step": 158 + }, + { + "epoch": 0.038396522579087176, + "grad_norm": 0.3455624580383301, + "learning_rate": 7.993078383235653e-05, + "loss": 1.8199, + "step": 159 + }, + { + "epoch": 0.03863801014247766, + "grad_norm": 0.3421010375022888, + "learning_rate": 7.992895091381656e-05, + "loss": 1.8818, + "step": 160 + }, + { + "epoch": 0.03887949770586815, + "grad_norm": 0.360836386680603, + "learning_rate": 7.99270940641283e-05, + "loss": 1.9759, + "step": 161 + }, + { + "epoch": 0.03912098526925863, + "grad_norm": 0.32319512963294983, + "learning_rate": 7.992521328440463e-05, + "loss": 1.6659, + "step": 162 + }, + { + "epoch": 0.03936247283264912, + "grad_norm": 
0.33566924929618835, + "learning_rate": 7.992330857577278e-05, + "loss": 1.7625, + "step": 163 + }, + { + "epoch": 0.039603960396039604, + "grad_norm": 0.3267430365085602, + "learning_rate": 7.992137993937434e-05, + "loss": 1.7359, + "step": 164 + }, + { + "epoch": 0.03984544795943009, + "grad_norm": 0.37398430705070496, + "learning_rate": 7.991942737636519e-05, + "loss": 2.0229, + "step": 165 + }, + { + "epoch": 0.040086935522820576, + "grad_norm": 0.3316766023635864, + "learning_rate": 7.991745088791563e-05, + "loss": 1.8624, + "step": 166 + }, + { + "epoch": 0.04032842308621106, + "grad_norm": 0.3504400849342346, + "learning_rate": 7.991545047521022e-05, + "loss": 2.0128, + "step": 167 + }, + { + "epoch": 0.04056991064960155, + "grad_norm": 0.3182665705680847, + "learning_rate": 7.991342613944791e-05, + "loss": 1.5942, + "step": 168 + }, + { + "epoch": 0.04081139821299203, + "grad_norm": 0.3529200851917267, + "learning_rate": 7.991137788184198e-05, + "loss": 1.9559, + "step": 169 + }, + { + "epoch": 0.04105288577638252, + "grad_norm": 0.35057875514030457, + "learning_rate": 7.990930570362002e-05, + "loss": 1.8836, + "step": 170 + }, + { + "epoch": 0.041294373339773004, + "grad_norm": 0.3297763764858246, + "learning_rate": 7.990720960602398e-05, + "loss": 1.8221, + "step": 171 + }, + { + "epoch": 0.04153586090316349, + "grad_norm": 0.3292389512062073, + "learning_rate": 7.990508959031015e-05, + "loss": 1.7315, + "step": 172 + }, + { + "epoch": 0.041777348466553975, + "grad_norm": 0.3380139172077179, + "learning_rate": 7.990294565774916e-05, + "loss": 1.7487, + "step": 173 + }, + { + "epoch": 0.04201883602994446, + "grad_norm": 0.3513992130756378, + "learning_rate": 7.990077780962593e-05, + "loss": 1.7758, + "step": 174 + }, + { + "epoch": 0.042260323593334946, + "grad_norm": 0.3371720612049103, + "learning_rate": 7.989858604723976e-05, + "loss": 1.7694, + "step": 175 + }, + { + "epoch": 0.04250181115672543, + "grad_norm": 0.3474743366241455, + "learning_rate": 7.989637037190427e-05, + "loss": 1.8237, + "step": 176 + }, + { + "epoch": 0.04274329872011592, + "grad_norm": 0.3570946455001831, + "learning_rate": 7.989413078494742e-05, + "loss": 1.852, + "step": 177 + }, + { + "epoch": 0.042984786283506396, + "grad_norm": 0.4105489253997803, + "learning_rate": 7.989186728771147e-05, + "loss": 2.0145, + "step": 178 + }, + { + "epoch": 0.04322627384689688, + "grad_norm": 0.31396129727363586, + "learning_rate": 7.988957988155305e-05, + "loss": 1.699, + "step": 179 + }, + { + "epoch": 0.04346776141028737, + "grad_norm": 0.33446812629699707, + "learning_rate": 7.98872685678431e-05, + "loss": 1.8951, + "step": 180 + }, + { + "epoch": 0.04370924897367785, + "grad_norm": 0.3372074067592621, + "learning_rate": 7.988493334796688e-05, + "loss": 1.751, + "step": 181 + }, + { + "epoch": 0.04395073653706834, + "grad_norm": 0.3188993036746979, + "learning_rate": 7.988257422332398e-05, + "loss": 1.6716, + "step": 182 + }, + { + "epoch": 0.044192224100458824, + "grad_norm": 0.32916897535324097, + "learning_rate": 7.988019119532834e-05, + "loss": 1.7562, + "step": 183 + }, + { + "epoch": 0.04443371166384931, + "grad_norm": 0.5125882029533386, + "learning_rate": 7.987778426540821e-05, + "loss": 2.423, + "step": 184 + }, + { + "epoch": 0.044675199227239795, + "grad_norm": 0.34698373079299927, + "learning_rate": 7.987535343500619e-05, + "loss": 1.8062, + "step": 185 + }, + { + "epoch": 0.04491668679063028, + "grad_norm": 0.349882036447525, + "learning_rate": 7.987289870557914e-05, + "loss": 1.9638, + "step": 186 + 
}, + { + "epoch": 0.045158174354020766, + "grad_norm": 0.34001484513282776, + "learning_rate": 7.98704200785983e-05, + "loss": 1.8865, + "step": 187 + }, + { + "epoch": 0.04539966191741125, + "grad_norm": 0.34518545866012573, + "learning_rate": 7.986791755554923e-05, + "loss": 1.7105, + "step": 188 + }, + { + "epoch": 0.04564114948080174, + "grad_norm": 0.3248199224472046, + "learning_rate": 7.986539113793179e-05, + "loss": 1.8116, + "step": 189 + }, + { + "epoch": 0.04588263704419222, + "grad_norm": 0.36076945066452026, + "learning_rate": 7.986284082726017e-05, + "loss": 1.8027, + "step": 190 + }, + { + "epoch": 0.04612412460758271, + "grad_norm": 0.34199753403663635, + "learning_rate": 7.98602666250629e-05, + "loss": 1.815, + "step": 191 + }, + { + "epoch": 0.046365612170973194, + "grad_norm": 0.35182511806488037, + "learning_rate": 7.985766853288278e-05, + "loss": 1.8876, + "step": 192 + }, + { + "epoch": 0.04660709973436368, + "grad_norm": 0.31644105911254883, + "learning_rate": 7.9855046552277e-05, + "loss": 1.807, + "step": 193 + }, + { + "epoch": 0.046848587297754166, + "grad_norm": 0.34520867466926575, + "learning_rate": 7.985240068481698e-05, + "loss": 1.8446, + "step": 194 + }, + { + "epoch": 0.04709007486114465, + "grad_norm": 0.33563631772994995, + "learning_rate": 7.984973093208852e-05, + "loss": 1.8509, + "step": 195 + }, + { + "epoch": 0.04733156242453514, + "grad_norm": 0.3410038352012634, + "learning_rate": 7.984703729569175e-05, + "loss": 2.0203, + "step": 196 + }, + { + "epoch": 0.04757304998792562, + "grad_norm": 0.3287442624568939, + "learning_rate": 7.984431977724105e-05, + "loss": 1.6625, + "step": 197 + }, + { + "epoch": 0.04781453755131611, + "grad_norm": 0.3447628915309906, + "learning_rate": 7.984157837836515e-05, + "loss": 2.0291, + "step": 198 + }, + { + "epoch": 0.048056025114706594, + "grad_norm": 0.31992051005363464, + "learning_rate": 7.983881310070709e-05, + "loss": 1.682, + "step": 199 + }, + { + "epoch": 0.04829751267809708, + "grad_norm": 0.3539101779460907, + "learning_rate": 7.983602394592422e-05, + "loss": 2.0146, + "step": 200 + }, + { + "epoch": 0.048539000241487565, + "grad_norm": 0.3836063742637634, + "learning_rate": 7.983321091568821e-05, + "loss": 1.6322, + "step": 201 + }, + { + "epoch": 0.04878048780487805, + "grad_norm": 0.3384498953819275, + "learning_rate": 7.983037401168503e-05, + "loss": 1.8082, + "step": 202 + }, + { + "epoch": 0.049021975368268536, + "grad_norm": 0.3204689621925354, + "learning_rate": 7.982751323561493e-05, + "loss": 1.7478, + "step": 203 + }, + { + "epoch": 0.04926346293165902, + "grad_norm": 0.363129585981369, + "learning_rate": 7.982462858919255e-05, + "loss": 1.8098, + "step": 204 + }, + { + "epoch": 0.04950495049504951, + "grad_norm": 0.3273480534553528, + "learning_rate": 7.982172007414675e-05, + "loss": 1.8649, + "step": 205 + }, + { + "epoch": 0.04974643805843999, + "grad_norm": 0.32456788420677185, + "learning_rate": 7.981878769222072e-05, + "loss": 1.773, + "step": 206 + }, + { + "epoch": 0.04998792562183048, + "grad_norm": 0.34132328629493713, + "learning_rate": 7.981583144517198e-05, + "loss": 1.7702, + "step": 207 + }, + { + "epoch": 0.050229413185220964, + "grad_norm": 0.3215339779853821, + "learning_rate": 7.981285133477233e-05, + "loss": 1.6318, + "step": 208 + }, + { + "epoch": 0.05047090074861145, + "grad_norm": 0.3282195031642914, + "learning_rate": 7.980984736280789e-05, + "loss": 1.7331, + "step": 209 + }, + { + "epoch": 0.050712388312001935, + "grad_norm": 0.3406447172164917, + "learning_rate": 
7.980681953107905e-05, + "loss": 1.837, + "step": 210 + }, + { + "epoch": 0.050953875875392414, + "grad_norm": 0.3377143442630768, + "learning_rate": 7.980376784140055e-05, + "loss": 1.8457, + "step": 211 + }, + { + "epoch": 0.0511953634387829, + "grad_norm": 0.3229312002658844, + "learning_rate": 7.980069229560137e-05, + "loss": 1.7076, + "step": 212 + }, + { + "epoch": 0.051436851002173385, + "grad_norm": 0.3151211142539978, + "learning_rate": 7.979759289552484e-05, + "loss": 1.7162, + "step": 213 + }, + { + "epoch": 0.05167833856556387, + "grad_norm": 0.3200671672821045, + "learning_rate": 7.979446964302856e-05, + "loss": 1.6625, + "step": 214 + }, + { + "epoch": 0.051919826128954356, + "grad_norm": 0.33359915018081665, + "learning_rate": 7.979132253998442e-05, + "loss": 1.9556, + "step": 215 + }, + { + "epoch": 0.05216131369234484, + "grad_norm": 0.3339202404022217, + "learning_rate": 7.978815158827862e-05, + "loss": 1.7216, + "step": 216 + }, + { + "epoch": 0.05240280125573533, + "grad_norm": 0.3254282772541046, + "learning_rate": 7.978495678981165e-05, + "loss": 1.7696, + "step": 217 + }, + { + "epoch": 0.05264428881912581, + "grad_norm": 0.3372923731803894, + "learning_rate": 7.978173814649828e-05, + "loss": 1.837, + "step": 218 + }, + { + "epoch": 0.0528857763825163, + "grad_norm": 0.32411250472068787, + "learning_rate": 7.977849566026761e-05, + "loss": 1.8982, + "step": 219 + }, + { + "epoch": 0.053127263945906784, + "grad_norm": 0.31956303119659424, + "learning_rate": 7.977522933306298e-05, + "loss": 1.884, + "step": 220 + }, + { + "epoch": 0.05336875150929727, + "grad_norm": 0.3496444821357727, + "learning_rate": 7.977193916684204e-05, + "loss": 1.9066, + "step": 221 + }, + { + "epoch": 0.053610239072687756, + "grad_norm": 0.29580965638160706, + "learning_rate": 7.976862516357675e-05, + "loss": 1.6975, + "step": 222 + }, + { + "epoch": 0.05385172663607824, + "grad_norm": 0.30984580516815186, + "learning_rate": 7.976528732525332e-05, + "loss": 1.8103, + "step": 223 + }, + { + "epoch": 0.05409321419946873, + "grad_norm": 0.33822616934776306, + "learning_rate": 7.976192565387225e-05, + "loss": 1.8781, + "step": 224 + }, + { + "epoch": 0.05433470176285921, + "grad_norm": 0.32609352469444275, + "learning_rate": 7.975854015144834e-05, + "loss": 1.8569, + "step": 225 + }, + { + "epoch": 0.0545761893262497, + "grad_norm": 0.33209675550460815, + "learning_rate": 7.975513082001069e-05, + "loss": 1.9403, + "step": 226 + }, + { + "epoch": 0.054817676889640184, + "grad_norm": 0.3058185577392578, + "learning_rate": 7.975169766160265e-05, + "loss": 1.6912, + "step": 227 + }, + { + "epoch": 0.05505916445303067, + "grad_norm": 0.35320064425468445, + "learning_rate": 7.974824067828184e-05, + "loss": 1.9151, + "step": 228 + }, + { + "epoch": 0.055300652016421155, + "grad_norm": 0.336840957403183, + "learning_rate": 7.97447598721202e-05, + "loss": 1.8253, + "step": 229 + }, + { + "epoch": 0.05554213957981164, + "grad_norm": 0.320771723985672, + "learning_rate": 7.974125524520393e-05, + "loss": 1.7369, + "step": 230 + }, + { + "epoch": 0.055783627143202126, + "grad_norm": 0.35173293948173523, + "learning_rate": 7.973772679963348e-05, + "loss": 2.0621, + "step": 231 + }, + { + "epoch": 0.05602511470659261, + "grad_norm": 0.3257352113723755, + "learning_rate": 7.973417453752364e-05, + "loss": 1.8283, + "step": 232 + }, + { + "epoch": 0.0562666022699831, + "grad_norm": 0.32054367661476135, + "learning_rate": 7.97305984610034e-05, + "loss": 1.8359, + "step": 233 + }, + { + "epoch": 0.05650808983337358, 
+ "grad_norm": 0.3325577974319458, + "learning_rate": 7.972699857221607e-05, + "loss": 1.9108, + "step": 234 + }, + { + "epoch": 0.05674957739676407, + "grad_norm": 0.3135945796966553, + "learning_rate": 7.972337487331923e-05, + "loss": 1.6775, + "step": 235 + }, + { + "epoch": 0.056991064960154554, + "grad_norm": 0.30711257457733154, + "learning_rate": 7.97197273664847e-05, + "loss": 1.7344, + "step": 236 + }, + { + "epoch": 0.05723255252354504, + "grad_norm": 0.3135779798030853, + "learning_rate": 7.971605605389858e-05, + "loss": 1.84, + "step": 237 + }, + { + "epoch": 0.057474040086935525, + "grad_norm": 0.29817330837249756, + "learning_rate": 7.971236093776129e-05, + "loss": 1.7427, + "step": 238 + }, + { + "epoch": 0.05771552765032601, + "grad_norm": 0.3177940845489502, + "learning_rate": 7.970864202028743e-05, + "loss": 1.7154, + "step": 239 + }, + { + "epoch": 0.057957015213716497, + "grad_norm": 0.3320569396018982, + "learning_rate": 7.970489930370593e-05, + "loss": 1.8771, + "step": 240 + }, + { + "epoch": 0.05819850277710698, + "grad_norm": 0.32810327410697937, + "learning_rate": 7.970113279025996e-05, + "loss": 1.8912, + "step": 241 + }, + { + "epoch": 0.05843999034049747, + "grad_norm": 0.3361932635307312, + "learning_rate": 7.969734248220695e-05, + "loss": 1.9356, + "step": 242 + }, + { + "epoch": 0.058681477903887946, + "grad_norm": 0.34913378953933716, + "learning_rate": 7.969352838181859e-05, + "loss": 1.8365, + "step": 243 + }, + { + "epoch": 0.05892296546727843, + "grad_norm": 0.3116905689239502, + "learning_rate": 7.968969049138086e-05, + "loss": 1.7415, + "step": 244 + }, + { + "epoch": 0.05916445303066892, + "grad_norm": 0.2941270172595978, + "learning_rate": 7.968582881319393e-05, + "loss": 1.6864, + "step": 245 + }, + { + "epoch": 0.0594059405940594, + "grad_norm": 0.32845309376716614, + "learning_rate": 7.968194334957231e-05, + "loss": 1.8652, + "step": 246 + }, + { + "epoch": 0.05964742815744989, + "grad_norm": 0.34726226329803467, + "learning_rate": 7.967803410284471e-05, + "loss": 1.7913, + "step": 247 + }, + { + "epoch": 0.059888915720840374, + "grad_norm": 0.3105839490890503, + "learning_rate": 7.967410107535414e-05, + "loss": 1.625, + "step": 248 + }, + { + "epoch": 0.06013040328423086, + "grad_norm": 0.3217976987361908, + "learning_rate": 7.967014426945778e-05, + "loss": 1.7158, + "step": 249 + }, + { + "epoch": 0.060371890847621346, + "grad_norm": 0.31204503774642944, + "learning_rate": 7.966616368752715e-05, + "loss": 1.7494, + "step": 250 + }, + { + "epoch": 0.06061337841101183, + "grad_norm": 0.3445545732975006, + "learning_rate": 7.966215933194797e-05, + "loss": 1.7762, + "step": 251 + }, + { + "epoch": 0.06085486597440232, + "grad_norm": 0.3073709011077881, + "learning_rate": 7.965813120512024e-05, + "loss": 1.5378, + "step": 252 + }, + { + "epoch": 0.0610963535377928, + "grad_norm": 0.3341065049171448, + "learning_rate": 7.965407930945818e-05, + "loss": 1.7331, + "step": 253 + }, + { + "epoch": 0.06133784110118329, + "grad_norm": 0.3325900137424469, + "learning_rate": 7.965000364739028e-05, + "loss": 1.8412, + "step": 254 + }, + { + "epoch": 0.061579328664573774, + "grad_norm": 0.3155021667480469, + "learning_rate": 7.964590422135923e-05, + "loss": 1.7861, + "step": 255 + }, + { + "epoch": 0.06182081622796426, + "grad_norm": 0.34470134973526, + "learning_rate": 7.964178103382201e-05, + "loss": 1.8445, + "step": 256 + }, + { + "epoch": 0.062062303791354745, + "grad_norm": 0.3327556848526001, + "learning_rate": 7.963763408724984e-05, + "loss": 1.7702, + 
"step": 257 + }, + { + "epoch": 0.06230379135474523, + "grad_norm": 0.3155532479286194, + "learning_rate": 7.963346338412816e-05, + "loss": 1.7478, + "step": 258 + }, + { + "epoch": 0.06254527891813572, + "grad_norm": 0.32543814182281494, + "learning_rate": 7.962926892695664e-05, + "loss": 1.8435, + "step": 259 + }, + { + "epoch": 0.0627867664815262, + "grad_norm": 0.3015563189983368, + "learning_rate": 7.962505071824919e-05, + "loss": 1.7412, + "step": 260 + }, + { + "epoch": 0.06302825404491669, + "grad_norm": 0.2858722507953644, + "learning_rate": 7.9620808760534e-05, + "loss": 1.5965, + "step": 261 + }, + { + "epoch": 0.06326974160830717, + "grad_norm": 0.309163361787796, + "learning_rate": 7.961654305635342e-05, + "loss": 1.7705, + "step": 262 + }, + { + "epoch": 0.06351122917169766, + "grad_norm": 0.31264615058898926, + "learning_rate": 7.96122536082641e-05, + "loss": 1.786, + "step": 263 + }, + { + "epoch": 0.06375271673508814, + "grad_norm": 0.31055596470832825, + "learning_rate": 7.960794041883688e-05, + "loss": 1.6784, + "step": 264 + }, + { + "epoch": 0.06399420429847863, + "grad_norm": 0.31669291853904724, + "learning_rate": 7.960360349065684e-05, + "loss": 1.7871, + "step": 265 + }, + { + "epoch": 0.06423569186186912, + "grad_norm": 0.3654109239578247, + "learning_rate": 7.95992428263233e-05, + "loss": 2.057, + "step": 266 + }, + { + "epoch": 0.0644771794252596, + "grad_norm": 0.2968808114528656, + "learning_rate": 7.959485842844977e-05, + "loss": 1.7963, + "step": 267 + }, + { + "epoch": 0.06471866698865009, + "grad_norm": 0.31135043501853943, + "learning_rate": 7.959045029966403e-05, + "loss": 1.7483, + "step": 268 + }, + { + "epoch": 0.06496015455204057, + "grad_norm": 0.30263540148735046, + "learning_rate": 7.958601844260807e-05, + "loss": 1.5378, + "step": 269 + }, + { + "epoch": 0.06520164211543106, + "grad_norm": 0.327248215675354, + "learning_rate": 7.958156285993807e-05, + "loss": 1.8316, + "step": 270 + }, + { + "epoch": 0.06544312967882154, + "grad_norm": 0.3525853455066681, + "learning_rate": 7.957708355432447e-05, + "loss": 2.1472, + "step": 271 + }, + { + "epoch": 0.06568461724221203, + "grad_norm": 0.3097147047519684, + "learning_rate": 7.957258052845189e-05, + "loss": 1.7649, + "step": 272 + }, + { + "epoch": 0.06592610480560251, + "grad_norm": 0.3462578058242798, + "learning_rate": 7.956805378501923e-05, + "loss": 1.926, + "step": 273 + }, + { + "epoch": 0.066167592368993, + "grad_norm": 0.32972514629364014, + "learning_rate": 7.956350332673954e-05, + "loss": 1.8855, + "step": 274 + }, + { + "epoch": 0.06640907993238349, + "grad_norm": 0.3470173478126526, + "learning_rate": 7.955892915634008e-05, + "loss": 1.8816, + "step": 275 + }, + { + "epoch": 0.06665056749577397, + "grad_norm": 0.3056792616844177, + "learning_rate": 7.955433127656239e-05, + "loss": 1.7791, + "step": 276 + }, + { + "epoch": 0.06689205505916446, + "grad_norm": 0.3143889605998993, + "learning_rate": 7.954970969016217e-05, + "loss": 1.7267, + "step": 277 + }, + { + "epoch": 0.06713354262255494, + "grad_norm": 0.3461814224720001, + "learning_rate": 7.954506439990931e-05, + "loss": 1.8244, + "step": 278 + }, + { + "epoch": 0.06737503018594543, + "grad_norm": 0.34658658504486084, + "learning_rate": 7.954039540858795e-05, + "loss": 1.888, + "step": 279 + }, + { + "epoch": 0.06761651774933591, + "grad_norm": 0.323635995388031, + "learning_rate": 7.953570271899644e-05, + "loss": 1.8313, + "step": 280 + }, + { + "epoch": 0.0678580053127264, + "grad_norm": 0.32019785046577454, + "learning_rate": 
7.953098633394728e-05, + "loss": 1.7461, + "step": 281 + }, + { + "epoch": 0.06809949287611688, + "grad_norm": 0.3277647793292999, + "learning_rate": 7.95262462562672e-05, + "loss": 1.7611, + "step": 282 + }, + { + "epoch": 0.06834098043950737, + "grad_norm": 0.31137654185295105, + "learning_rate": 7.952148248879718e-05, + "loss": 1.7579, + "step": 283 + }, + { + "epoch": 0.06858246800289786, + "grad_norm": 0.3207230269908905, + "learning_rate": 7.951669503439232e-05, + "loss": 1.7806, + "step": 284 + }, + { + "epoch": 0.06882395556628834, + "grad_norm": 0.31498652696609497, + "learning_rate": 7.951188389592193e-05, + "loss": 1.8651, + "step": 285 + }, + { + "epoch": 0.06906544312967883, + "grad_norm": 0.32896509766578674, + "learning_rate": 7.950704907626956e-05, + "loss": 1.7896, + "step": 286 + }, + { + "epoch": 0.06930693069306931, + "grad_norm": 0.3297777473926544, + "learning_rate": 7.950219057833293e-05, + "loss": 1.87, + "step": 287 + }, + { + "epoch": 0.0695484182564598, + "grad_norm": 0.32095208764076233, + "learning_rate": 7.949730840502392e-05, + "loss": 1.8186, + "step": 288 + }, + { + "epoch": 0.06978990581985028, + "grad_norm": 0.3138609230518341, + "learning_rate": 7.949240255926867e-05, + "loss": 1.7104, + "step": 289 + }, + { + "epoch": 0.07003139338324077, + "grad_norm": 0.30844905972480774, + "learning_rate": 7.948747304400743e-05, + "loss": 1.7806, + "step": 290 + }, + { + "epoch": 0.07027288094663126, + "grad_norm": 0.3149530589580536, + "learning_rate": 7.948251986219468e-05, + "loss": 1.8081, + "step": 291 + }, + { + "epoch": 0.07051436851002173, + "grad_norm": 0.3314594328403473, + "learning_rate": 7.947754301679909e-05, + "loss": 1.8093, + "step": 292 + }, + { + "epoch": 0.07075585607341221, + "grad_norm": 0.32003554701805115, + "learning_rate": 7.947254251080348e-05, + "loss": 1.8002, + "step": 293 + }, + { + "epoch": 0.0709973436368027, + "grad_norm": 0.3048597574234009, + "learning_rate": 7.946751834720488e-05, + "loss": 1.8229, + "step": 294 + }, + { + "epoch": 0.07123883120019318, + "grad_norm": 0.3036291301250458, + "learning_rate": 7.946247052901449e-05, + "loss": 1.8471, + "step": 295 + }, + { + "epoch": 0.07148031876358367, + "grad_norm": 0.3238702118396759, + "learning_rate": 7.945739905925768e-05, + "loss": 1.7944, + "step": 296 + }, + { + "epoch": 0.07172180632697416, + "grad_norm": 0.31713131070137024, + "learning_rate": 7.945230394097399e-05, + "loss": 1.8629, + "step": 297 + }, + { + "epoch": 0.07196329389036464, + "grad_norm": 0.33282196521759033, + "learning_rate": 7.944718517721719e-05, + "loss": 1.8295, + "step": 298 + }, + { + "epoch": 0.07220478145375513, + "grad_norm": 0.3299509584903717, + "learning_rate": 7.944204277105512e-05, + "loss": 1.8887, + "step": 299 + }, + { + "epoch": 0.07244626901714561, + "grad_norm": 0.32252463698387146, + "learning_rate": 7.943687672556989e-05, + "loss": 1.9744, + "step": 300 + }, + { + "epoch": 0.0726877565805361, + "grad_norm": 0.31342577934265137, + "learning_rate": 7.943168704385771e-05, + "loss": 1.8915, + "step": 301 + }, + { + "epoch": 0.07292924414392658, + "grad_norm": 0.31736376881599426, + "learning_rate": 7.942647372902898e-05, + "loss": 1.6628, + "step": 302 + }, + { + "epoch": 0.07317073170731707, + "grad_norm": 0.3148774206638336, + "learning_rate": 7.942123678420829e-05, + "loss": 1.9219, + "step": 303 + }, + { + "epoch": 0.07341221927070755, + "grad_norm": 0.31064704060554504, + "learning_rate": 7.941597621253434e-05, + "loss": 1.6907, + "step": 304 + }, + { + "epoch": 0.07365370683409804, + 
"grad_norm": 0.34153732657432556, + "learning_rate": 7.941069201716003e-05, + "loss": 1.8361, + "step": 305 + }, + { + "epoch": 0.07389519439748853, + "grad_norm": 0.3452036380767822, + "learning_rate": 7.94053842012524e-05, + "loss": 1.9958, + "step": 306 + }, + { + "epoch": 0.07413668196087901, + "grad_norm": 0.3184818625450134, + "learning_rate": 7.940005276799267e-05, + "loss": 1.8116, + "step": 307 + }, + { + "epoch": 0.0743781695242695, + "grad_norm": 0.3384685516357422, + "learning_rate": 7.93946977205762e-05, + "loss": 1.8963, + "step": 308 + }, + { + "epoch": 0.07461965708765998, + "grad_norm": 0.31626102328300476, + "learning_rate": 7.938931906221246e-05, + "loss": 1.7312, + "step": 309 + }, + { + "epoch": 0.07486114465105047, + "grad_norm": 0.3364972472190857, + "learning_rate": 7.938391679612515e-05, + "loss": 1.9645, + "step": 310 + }, + { + "epoch": 0.07510263221444095, + "grad_norm": 0.31800857186317444, + "learning_rate": 7.93784909255521e-05, + "loss": 1.873, + "step": 311 + }, + { + "epoch": 0.07534411977783144, + "grad_norm": 0.2949671745300293, + "learning_rate": 7.937304145374522e-05, + "loss": 1.7794, + "step": 312 + }, + { + "epoch": 0.07558560734122192, + "grad_norm": 0.3183116912841797, + "learning_rate": 7.936756838397064e-05, + "loss": 1.9644, + "step": 313 + }, + { + "epoch": 0.07582709490461241, + "grad_norm": 0.32806089520454407, + "learning_rate": 7.93620717195086e-05, + "loss": 1.8161, + "step": 314 + }, + { + "epoch": 0.0760685824680029, + "grad_norm": 0.3097519874572754, + "learning_rate": 7.935655146365353e-05, + "loss": 1.8672, + "step": 315 + }, + { + "epoch": 0.07631007003139338, + "grad_norm": 0.3398526608943939, + "learning_rate": 7.935100761971388e-05, + "loss": 2.0628, + "step": 316 + }, + { + "epoch": 0.07655155759478387, + "grad_norm": 0.2980629503726959, + "learning_rate": 7.934544019101238e-05, + "loss": 1.7722, + "step": 317 + }, + { + "epoch": 0.07679304515817435, + "grad_norm": 0.33271175622940063, + "learning_rate": 7.93398491808858e-05, + "loss": 1.884, + "step": 318 + }, + { + "epoch": 0.07703453272156484, + "grad_norm": 0.3190302550792694, + "learning_rate": 7.933423459268509e-05, + "loss": 1.671, + "step": 319 + }, + { + "epoch": 0.07727602028495532, + "grad_norm": 0.309345006942749, + "learning_rate": 7.932859642977532e-05, + "loss": 1.7244, + "step": 320 + }, + { + "epoch": 0.07751750784834581, + "grad_norm": 0.3233974575996399, + "learning_rate": 7.932293469553566e-05, + "loss": 1.852, + "step": 321 + }, + { + "epoch": 0.0777589954117363, + "grad_norm": 0.301312118768692, + "learning_rate": 7.931724939335945e-05, + "loss": 1.7854, + "step": 322 + }, + { + "epoch": 0.07800048297512678, + "grad_norm": 0.33955588936805725, + "learning_rate": 7.931154052665413e-05, + "loss": 2.0226, + "step": 323 + }, + { + "epoch": 0.07824197053851727, + "grad_norm": 0.3273119330406189, + "learning_rate": 7.930580809884129e-05, + "loss": 1.8961, + "step": 324 + }, + { + "epoch": 0.07848345810190775, + "grad_norm": 0.30406197905540466, + "learning_rate": 7.930005211335659e-05, + "loss": 1.7842, + "step": 325 + }, + { + "epoch": 0.07872494566529824, + "grad_norm": 0.30615532398223877, + "learning_rate": 7.929427257364987e-05, + "loss": 1.6904, + "step": 326 + }, + { + "epoch": 0.07896643322868872, + "grad_norm": 0.30859431624412537, + "learning_rate": 7.928846948318504e-05, + "loss": 1.736, + "step": 327 + }, + { + "epoch": 0.07920792079207921, + "grad_norm": 0.3163640797138214, + "learning_rate": 7.928264284544015e-05, + "loss": 1.7944, + "step": 328 + 
}, + { + "epoch": 0.0794494083554697, + "grad_norm": 0.3048076629638672, + "learning_rate": 7.927679266390735e-05, + "loss": 1.8136, + "step": 329 + }, + { + "epoch": 0.07969089591886018, + "grad_norm": 0.30701613426208496, + "learning_rate": 7.927091894209293e-05, + "loss": 1.7733, + "step": 330 + }, + { + "epoch": 0.07993238348225067, + "grad_norm": 0.3330737054347992, + "learning_rate": 7.926502168351724e-05, + "loss": 1.7777, + "step": 331 + }, + { + "epoch": 0.08017387104564115, + "grad_norm": 0.3272298276424408, + "learning_rate": 7.925910089171478e-05, + "loss": 1.778, + "step": 332 + }, + { + "epoch": 0.08041535860903164, + "grad_norm": 0.3150383234024048, + "learning_rate": 7.925315657023412e-05, + "loss": 1.7796, + "step": 333 + }, + { + "epoch": 0.08065684617242212, + "grad_norm": 0.2980138659477234, + "learning_rate": 7.924718872263795e-05, + "loss": 1.6073, + "step": 334 + }, + { + "epoch": 0.08089833373581261, + "grad_norm": 0.4316518008708954, + "learning_rate": 7.924119735250307e-05, + "loss": 2.2031, + "step": 335 + }, + { + "epoch": 0.0811398212992031, + "grad_norm": 0.31057435274124146, + "learning_rate": 7.923518246342037e-05, + "loss": 1.6824, + "step": 336 + }, + { + "epoch": 0.08138130886259358, + "grad_norm": 0.3399696946144104, + "learning_rate": 7.922914405899482e-05, + "loss": 1.9264, + "step": 337 + }, + { + "epoch": 0.08162279642598406, + "grad_norm": 0.3247483968734741, + "learning_rate": 7.922308214284551e-05, + "loss": 1.8827, + "step": 338 + }, + { + "epoch": 0.08186428398937455, + "grad_norm": 0.31430065631866455, + "learning_rate": 7.921699671860561e-05, + "loss": 1.8006, + "step": 339 + }, + { + "epoch": 0.08210577155276504, + "grad_norm": 0.31440189480781555, + "learning_rate": 7.921088778992236e-05, + "loss": 1.8218, + "step": 340 + }, + { + "epoch": 0.08234725911615552, + "grad_norm": 0.29964831471443176, + "learning_rate": 7.920475536045711e-05, + "loss": 1.7306, + "step": 341 + }, + { + "epoch": 0.08258874667954601, + "grad_norm": 0.3098861575126648, + "learning_rate": 7.919859943388531e-05, + "loss": 1.8838, + "step": 342 + }, + { + "epoch": 0.08283023424293649, + "grad_norm": 0.3180960714817047, + "learning_rate": 7.919242001389645e-05, + "loss": 1.953, + "step": 343 + }, + { + "epoch": 0.08307172180632698, + "grad_norm": 0.31091493368148804, + "learning_rate": 7.918621710419414e-05, + "loss": 1.7183, + "step": 344 + }, + { + "epoch": 0.08331320936971746, + "grad_norm": 0.3297421932220459, + "learning_rate": 7.917999070849606e-05, + "loss": 1.966, + "step": 345 + }, + { + "epoch": 0.08355469693310795, + "grad_norm": 0.33071455359458923, + "learning_rate": 7.917374083053392e-05, + "loss": 1.8315, + "step": 346 + }, + { + "epoch": 0.08379618449649844, + "grad_norm": 0.31250739097595215, + "learning_rate": 7.916746747405358e-05, + "loss": 1.6587, + "step": 347 + }, + { + "epoch": 0.08403767205988892, + "grad_norm": 0.3179730176925659, + "learning_rate": 7.916117064281491e-05, + "loss": 1.9032, + "step": 348 + }, + { + "epoch": 0.0842791596232794, + "grad_norm": 0.3075062036514282, + "learning_rate": 7.915485034059191e-05, + "loss": 1.703, + "step": 349 + }, + { + "epoch": 0.08452064718666989, + "grad_norm": 0.3239034414291382, + "learning_rate": 7.914850657117255e-05, + "loss": 1.9085, + "step": 350 + }, + { + "epoch": 0.08476213475006038, + "grad_norm": 0.3100548982620239, + "learning_rate": 7.914213933835899e-05, + "loss": 1.91, + "step": 351 + }, + { + "epoch": 0.08500362231345086, + "grad_norm": 0.40014979243278503, + "learning_rate": 
7.913574864596733e-05, + "loss": 1.7173, + "step": 352 + }, + { + "epoch": 0.08524510987684135, + "grad_norm": 0.3187917470932007, + "learning_rate": 7.912933449782784e-05, + "loss": 1.8536, + "step": 353 + }, + { + "epoch": 0.08548659744023183, + "grad_norm": 0.3200077712535858, + "learning_rate": 7.912289689778477e-05, + "loss": 1.8253, + "step": 354 + }, + { + "epoch": 0.08572808500362232, + "grad_norm": 0.2999676465988159, + "learning_rate": 7.911643584969644e-05, + "loss": 1.5448, + "step": 355 + }, + { + "epoch": 0.08596957256701279, + "grad_norm": 0.3196452558040619, + "learning_rate": 7.910995135743527e-05, + "loss": 1.7994, + "step": 356 + }, + { + "epoch": 0.08621106013040328, + "grad_norm": 0.32246023416519165, + "learning_rate": 7.910344342488767e-05, + "loss": 1.8654, + "step": 357 + }, + { + "epoch": 0.08645254769379376, + "grad_norm": 0.3140832185745239, + "learning_rate": 7.909691205595415e-05, + "loss": 1.7172, + "step": 358 + }, + { + "epoch": 0.08669403525718425, + "grad_norm": 0.3014783561229706, + "learning_rate": 7.909035725454922e-05, + "loss": 1.8307, + "step": 359 + }, + { + "epoch": 0.08693552282057473, + "grad_norm": 0.31697165966033936, + "learning_rate": 7.908377902460145e-05, + "loss": 1.8369, + "step": 360 + }, + { + "epoch": 0.08717701038396522, + "grad_norm": 0.34776023030281067, + "learning_rate": 7.907717737005347e-05, + "loss": 1.7673, + "step": 361 + }, + { + "epoch": 0.0874184979473557, + "grad_norm": 0.30561959743499756, + "learning_rate": 7.907055229486194e-05, + "loss": 1.7124, + "step": 362 + }, + { + "epoch": 0.08765998551074619, + "grad_norm": 0.31223785877227783, + "learning_rate": 7.906390380299757e-05, + "loss": 1.8257, + "step": 363 + }, + { + "epoch": 0.08790147307413668, + "grad_norm": 0.31563735008239746, + "learning_rate": 7.905723189844505e-05, + "loss": 1.6304, + "step": 364 + }, + { + "epoch": 0.08814296063752716, + "grad_norm": 0.3267379105091095, + "learning_rate": 7.905053658520317e-05, + "loss": 1.8192, + "step": 365 + }, + { + "epoch": 0.08838444820091765, + "grad_norm": 0.3055742084980011, + "learning_rate": 7.90438178672847e-05, + "loss": 1.7856, + "step": 366 + }, + { + "epoch": 0.08862593576430813, + "grad_norm": 0.3425109088420868, + "learning_rate": 7.90370757487165e-05, + "loss": 1.8594, + "step": 367 + }, + { + "epoch": 0.08886742332769862, + "grad_norm": 0.34041139483451843, + "learning_rate": 7.903031023353937e-05, + "loss": 1.8386, + "step": 368 + }, + { + "epoch": 0.0891089108910891, + "grad_norm": 0.3045822083950043, + "learning_rate": 7.902352132580818e-05, + "loss": 1.7817, + "step": 369 + }, + { + "epoch": 0.08935039845447959, + "grad_norm": 0.33832845091819763, + "learning_rate": 7.901670902959184e-05, + "loss": 1.891, + "step": 370 + }, + { + "epoch": 0.08959188601787008, + "grad_norm": 0.31787779927253723, + "learning_rate": 7.900987334897323e-05, + "loss": 1.8206, + "step": 371 + }, + { + "epoch": 0.08983337358126056, + "grad_norm": 0.3053485155105591, + "learning_rate": 7.900301428804929e-05, + "loss": 1.8119, + "step": 372 + }, + { + "epoch": 0.09007486114465105, + "grad_norm": 0.3189673125743866, + "learning_rate": 7.899613185093094e-05, + "loss": 1.7181, + "step": 373 + }, + { + "epoch": 0.09031634870804153, + "grad_norm": 0.330003947019577, + "learning_rate": 7.898922604174312e-05, + "loss": 1.7952, + "step": 374 + }, + { + "epoch": 0.09055783627143202, + "grad_norm": 0.32323578000068665, + "learning_rate": 7.89822968646248e-05, + "loss": 1.8331, + "step": 375 + }, + { + "epoch": 0.0907993238348225, + 
"grad_norm": 0.3234061896800995, + "learning_rate": 7.897534432372891e-05, + "loss": 1.8201, + "step": 376 + }, + { + "epoch": 0.09104081139821299, + "grad_norm": 0.3311329185962677, + "learning_rate": 7.896836842322241e-05, + "loss": 1.8964, + "step": 377 + }, + { + "epoch": 0.09128229896160348, + "grad_norm": 0.33288565278053284, + "learning_rate": 7.896136916728628e-05, + "loss": 1.7157, + "step": 378 + }, + { + "epoch": 0.09152378652499396, + "grad_norm": 0.2955407202243805, + "learning_rate": 7.895434656011546e-05, + "loss": 1.7627, + "step": 379 + }, + { + "epoch": 0.09176527408838445, + "grad_norm": 0.32634636759757996, + "learning_rate": 7.894730060591892e-05, + "loss": 1.9303, + "step": 380 + }, + { + "epoch": 0.09200676165177493, + "grad_norm": 0.3033986985683441, + "learning_rate": 7.894023130891958e-05, + "loss": 1.6711, + "step": 381 + }, + { + "epoch": 0.09224824921516542, + "grad_norm": 0.31847289204597473, + "learning_rate": 7.893313867335439e-05, + "loss": 1.7684, + "step": 382 + }, + { + "epoch": 0.0924897367785559, + "grad_norm": 0.3253558874130249, + "learning_rate": 7.892602270347427e-05, + "loss": 1.8255, + "step": 383 + }, + { + "epoch": 0.09273122434194639, + "grad_norm": 0.3166964054107666, + "learning_rate": 7.891888340354413e-05, + "loss": 1.7866, + "step": 384 + }, + { + "epoch": 0.09297271190533687, + "grad_norm": 0.3175016939640045, + "learning_rate": 7.891172077784288e-05, + "loss": 1.8906, + "step": 385 + }, + { + "epoch": 0.09321419946872736, + "grad_norm": 0.31322944164276123, + "learning_rate": 7.890453483066337e-05, + "loss": 1.8335, + "step": 386 + }, + { + "epoch": 0.09345568703211785, + "grad_norm": 0.324131041765213, + "learning_rate": 7.889732556631243e-05, + "loss": 1.8105, + "step": 387 + }, + { + "epoch": 0.09369717459550833, + "grad_norm": 0.3010997772216797, + "learning_rate": 7.889009298911093e-05, + "loss": 1.654, + "step": 388 + }, + { + "epoch": 0.09393866215889882, + "grad_norm": 0.3338373005390167, + "learning_rate": 7.888283710339364e-05, + "loss": 1.9387, + "step": 389 + }, + { + "epoch": 0.0941801497222893, + "grad_norm": 0.32794541120529175, + "learning_rate": 7.887555791350932e-05, + "loss": 1.7921, + "step": 390 + }, + { + "epoch": 0.09442163728567979, + "grad_norm": 0.30111920833587646, + "learning_rate": 7.886825542382073e-05, + "loss": 1.7964, + "step": 391 + }, + { + "epoch": 0.09466312484907027, + "grad_norm": 0.3243824243545532, + "learning_rate": 7.886092963870453e-05, + "loss": 1.8344, + "step": 392 + }, + { + "epoch": 0.09490461241246076, + "grad_norm": 0.32472896575927734, + "learning_rate": 7.885358056255141e-05, + "loss": 1.83, + "step": 393 + }, + { + "epoch": 0.09514609997585124, + "grad_norm": 0.3036370277404785, + "learning_rate": 7.884620819976599e-05, + "loss": 1.7287, + "step": 394 + }, + { + "epoch": 0.09538758753924173, + "grad_norm": 0.29689764976501465, + "learning_rate": 7.883881255476683e-05, + "loss": 1.6488, + "step": 395 + }, + { + "epoch": 0.09562907510263222, + "grad_norm": 0.3028903901576996, + "learning_rate": 7.883139363198647e-05, + "loss": 1.7084, + "step": 396 + }, + { + "epoch": 0.0958705626660227, + "grad_norm": 0.3292778730392456, + "learning_rate": 7.882395143587139e-05, + "loss": 1.6758, + "step": 397 + }, + { + "epoch": 0.09611205022941319, + "grad_norm": 0.31232085824012756, + "learning_rate": 7.8816485970882e-05, + "loss": 1.717, + "step": 398 + }, + { + "epoch": 0.09635353779280367, + "grad_norm": 0.29923149943351746, + "learning_rate": 7.880899724149272e-05, + "loss": 1.6746, + "step": 
399 + }, + { + "epoch": 0.09659502535619416, + "grad_norm": 0.3306083083152771, + "learning_rate": 7.880148525219183e-05, + "loss": 1.8822, + "step": 400 + }, + { + "epoch": 0.09683651291958464, + "grad_norm": 0.3653043508529663, + "learning_rate": 7.879395000748162e-05, + "loss": 1.8299, + "step": 401 + }, + { + "epoch": 0.09707800048297513, + "grad_norm": 0.3056018054485321, + "learning_rate": 7.878639151187826e-05, + "loss": 1.7678, + "step": 402 + }, + { + "epoch": 0.09731948804636562, + "grad_norm": 0.3255383372306824, + "learning_rate": 7.87788097699119e-05, + "loss": 1.7895, + "step": 403 + }, + { + "epoch": 0.0975609756097561, + "grad_norm": 0.33858051896095276, + "learning_rate": 7.87712047861266e-05, + "loss": 1.7134, + "step": 404 + }, + { + "epoch": 0.09780246317314659, + "grad_norm": 0.31411212682724, + "learning_rate": 7.876357656508037e-05, + "loss": 1.8154, + "step": 405 + }, + { + "epoch": 0.09804395073653707, + "grad_norm": 0.3814811408519745, + "learning_rate": 7.87559251113451e-05, + "loss": 2.0315, + "step": 406 + }, + { + "epoch": 0.09828543829992756, + "grad_norm": 0.3188989758491516, + "learning_rate": 7.874825042950668e-05, + "loss": 1.8787, + "step": 407 + }, + { + "epoch": 0.09852692586331804, + "grad_norm": 0.31648024916648865, + "learning_rate": 7.874055252416486e-05, + "loss": 1.8118, + "step": 408 + }, + { + "epoch": 0.09876841342670853, + "grad_norm": 0.3221266269683838, + "learning_rate": 7.87328313999333e-05, + "loss": 1.7063, + "step": 409 + }, + { + "epoch": 0.09900990099009901, + "grad_norm": 0.3123248219490051, + "learning_rate": 7.872508706143966e-05, + "loss": 1.7513, + "step": 410 + }, + { + "epoch": 0.0992513885534895, + "grad_norm": 0.3216148614883423, + "learning_rate": 7.871731951332541e-05, + "loss": 1.7605, + "step": 411 + }, + { + "epoch": 0.09949287611687999, + "grad_norm": 0.32707303762435913, + "learning_rate": 7.8709528760246e-05, + "loss": 1.9001, + "step": 412 + }, + { + "epoch": 0.09973436368027047, + "grad_norm": 0.30996280908584595, + "learning_rate": 7.870171480687076e-05, + "loss": 1.7069, + "step": 413 + }, + { + "epoch": 0.09997585124366096, + "grad_norm": 0.32727497816085815, + "learning_rate": 7.869387765788293e-05, + "loss": 1.9334, + "step": 414 + }, + { + "epoch": 0.10021733880705144, + "grad_norm": 0.33509087562561035, + "learning_rate": 7.868601731797966e-05, + "loss": 1.9259, + "step": 415 + }, + { + "epoch": 0.10045882637044193, + "grad_norm": 0.31665563583374023, + "learning_rate": 7.867813379187197e-05, + "loss": 1.8213, + "step": 416 + }, + { + "epoch": 0.10070031393383241, + "grad_norm": 0.3228408098220825, + "learning_rate": 7.867022708428482e-05, + "loss": 1.984, + "step": 417 + }, + { + "epoch": 0.1009418014972229, + "grad_norm": 0.32034409046173096, + "learning_rate": 7.866229719995705e-05, + "loss": 1.6622, + "step": 418 + }, + { + "epoch": 0.10118328906061339, + "grad_norm": 0.3438382148742676, + "learning_rate": 7.865434414364136e-05, + "loss": 1.888, + "step": 419 + }, + { + "epoch": 0.10142477662400387, + "grad_norm": 0.3029784560203552, + "learning_rate": 7.864636792010437e-05, + "loss": 1.7853, + "step": 420 + }, + { + "epoch": 0.10166626418739434, + "grad_norm": 0.30743077397346497, + "learning_rate": 7.863836853412656e-05, + "loss": 1.8469, + "step": 421 + }, + { + "epoch": 0.10190775175078483, + "grad_norm": 0.2992570102214813, + "learning_rate": 7.863034599050235e-05, + "loss": 1.6541, + "step": 422 + }, + { + "epoch": 0.10214923931417531, + "grad_norm": 0.32089993357658386, + "learning_rate": 
7.862230029403995e-05, + "loss": 1.8598, + "step": 423 + }, + { + "epoch": 0.1023907268775658, + "grad_norm": 0.316001832485199, + "learning_rate": 7.861423144956152e-05, + "loss": 1.7655, + "step": 424 + }, + { + "epoch": 0.10263221444095628, + "grad_norm": 0.2942180037498474, + "learning_rate": 7.860613946190306e-05, + "loss": 1.6929, + "step": 425 + }, + { + "epoch": 0.10287370200434677, + "grad_norm": 0.3150692582130432, + "learning_rate": 7.859802433591446e-05, + "loss": 1.8213, + "step": 426 + }, + { + "epoch": 0.10311518956773726, + "grad_norm": 0.32416391372680664, + "learning_rate": 7.858988607645945e-05, + "loss": 1.7896, + "step": 427 + }, + { + "epoch": 0.10335667713112774, + "grad_norm": 0.3014080822467804, + "learning_rate": 7.858172468841565e-05, + "loss": 1.785, + "step": 428 + }, + { + "epoch": 0.10359816469451823, + "grad_norm": 0.2929125428199768, + "learning_rate": 7.857354017667453e-05, + "loss": 1.7752, + "step": 429 + }, + { + "epoch": 0.10383965225790871, + "grad_norm": 0.2827056348323822, + "learning_rate": 7.856533254614143e-05, + "loss": 1.5381, + "step": 430 + }, + { + "epoch": 0.1040811398212992, + "grad_norm": 0.32544657588005066, + "learning_rate": 7.855710180173554e-05, + "loss": 1.8389, + "step": 431 + }, + { + "epoch": 0.10432262738468968, + "grad_norm": 0.32522135972976685, + "learning_rate": 7.854884794838987e-05, + "loss": 1.8001, + "step": 432 + }, + { + "epoch": 0.10456411494808017, + "grad_norm": 0.3139720559120178, + "learning_rate": 7.854057099105135e-05, + "loss": 1.9425, + "step": 433 + }, + { + "epoch": 0.10480560251147066, + "grad_norm": 0.31643447279930115, + "learning_rate": 7.85322709346807e-05, + "loss": 1.8469, + "step": 434 + }, + { + "epoch": 0.10504709007486114, + "grad_norm": 0.3155469000339508, + "learning_rate": 7.852394778425251e-05, + "loss": 1.7791, + "step": 435 + }, + { + "epoch": 0.10528857763825163, + "grad_norm": 0.3019671142101288, + "learning_rate": 7.851560154475519e-05, + "loss": 1.7212, + "step": 436 + }, + { + "epoch": 0.10553006520164211, + "grad_norm": 0.2972007989883423, + "learning_rate": 7.850723222119102e-05, + "loss": 1.6482, + "step": 437 + }, + { + "epoch": 0.1057715527650326, + "grad_norm": 0.31408366560935974, + "learning_rate": 7.84988398185761e-05, + "loss": 1.6854, + "step": 438 + }, + { + "epoch": 0.10601304032842308, + "grad_norm": 0.3040061295032501, + "learning_rate": 7.849042434194033e-05, + "loss": 1.541, + "step": 439 + }, + { + "epoch": 0.10625452789181357, + "grad_norm": 0.3008679151535034, + "learning_rate": 7.848198579632751e-05, + "loss": 1.6453, + "step": 440 + }, + { + "epoch": 0.10649601545520405, + "grad_norm": 0.3023047149181366, + "learning_rate": 7.847352418679519e-05, + "loss": 1.7668, + "step": 441 + }, + { + "epoch": 0.10673750301859454, + "grad_norm": 0.3142707347869873, + "learning_rate": 7.846503951841481e-05, + "loss": 1.8614, + "step": 442 + }, + { + "epoch": 0.10697899058198503, + "grad_norm": 0.3729107677936554, + "learning_rate": 7.845653179627158e-05, + "loss": 1.9223, + "step": 443 + }, + { + "epoch": 0.10722047814537551, + "grad_norm": 0.30760377645492554, + "learning_rate": 7.844800102546455e-05, + "loss": 1.8463, + "step": 444 + }, + { + "epoch": 0.107461965708766, + "grad_norm": 0.30554649233818054, + "learning_rate": 7.843944721110657e-05, + "loss": 1.8305, + "step": 445 + }, + { + "epoch": 0.10770345327215648, + "grad_norm": 0.33122071623802185, + "learning_rate": 7.843087035832433e-05, + "loss": 1.8518, + "step": 446 + }, + { + "epoch": 0.10794494083554697, + 
"grad_norm": 0.3029637336730957, + "learning_rate": 7.842227047225831e-05, + "loss": 1.7256, + "step": 447 + }, + { + "epoch": 0.10818642839893745, + "grad_norm": 0.3378790616989136, + "learning_rate": 7.841364755806276e-05, + "loss": 1.7585, + "step": 448 + }, + { + "epoch": 0.10842791596232794, + "grad_norm": 0.3104263246059418, + "learning_rate": 7.840500162090581e-05, + "loss": 1.8183, + "step": 449 + }, + { + "epoch": 0.10866940352571842, + "grad_norm": 0.31785666942596436, + "learning_rate": 7.839633266596932e-05, + "loss": 1.766, + "step": 450 + }, + { + "epoch": 0.10891089108910891, + "grad_norm": 0.306466668844223, + "learning_rate": 7.838764069844896e-05, + "loss": 1.8549, + "step": 451 + }, + { + "epoch": 0.1091523786524994, + "grad_norm": 0.32088086009025574, + "learning_rate": 7.837892572355422e-05, + "loss": 1.9489, + "step": 452 + }, + { + "epoch": 0.10939386621588988, + "grad_norm": 0.3011303246021271, + "learning_rate": 7.837018774650837e-05, + "loss": 1.6997, + "step": 453 + }, + { + "epoch": 0.10963535377928037, + "grad_norm": 0.3179236054420471, + "learning_rate": 7.836142677254844e-05, + "loss": 1.7926, + "step": 454 + }, + { + "epoch": 0.10987684134267085, + "grad_norm": 0.3137897551059723, + "learning_rate": 7.835264280692527e-05, + "loss": 1.8318, + "step": 455 + }, + { + "epoch": 0.11011832890606134, + "grad_norm": 0.3057895004749298, + "learning_rate": 7.834383585490347e-05, + "loss": 1.8321, + "step": 456 + }, + { + "epoch": 0.11035981646945182, + "grad_norm": 0.30022135376930237, + "learning_rate": 7.83350059217614e-05, + "loss": 1.7029, + "step": 457 + }, + { + "epoch": 0.11060130403284231, + "grad_norm": 0.32522615790367126, + "learning_rate": 7.832615301279128e-05, + "loss": 1.8882, + "step": 458 + }, + { + "epoch": 0.1108427915962328, + "grad_norm": 0.32540178298950195, + "learning_rate": 7.831727713329899e-05, + "loss": 1.8073, + "step": 459 + }, + { + "epoch": 0.11108427915962328, + "grad_norm": 0.3010394871234894, + "learning_rate": 7.830837828860425e-05, + "loss": 1.743, + "step": 460 + }, + { + "epoch": 0.11132576672301377, + "grad_norm": 0.30483198165893555, + "learning_rate": 7.829945648404051e-05, + "loss": 1.7134, + "step": 461 + }, + { + "epoch": 0.11156725428640425, + "grad_norm": 0.3163911700248718, + "learning_rate": 7.829051172495501e-05, + "loss": 1.7856, + "step": 462 + }, + { + "epoch": 0.11180874184979474, + "grad_norm": 0.31254616379737854, + "learning_rate": 7.828154401670873e-05, + "loss": 1.8231, + "step": 463 + }, + { + "epoch": 0.11205022941318522, + "grad_norm": 0.29678279161453247, + "learning_rate": 7.827255336467639e-05, + "loss": 1.7363, + "step": 464 + }, + { + "epoch": 0.11229171697657571, + "grad_norm": 0.3344740569591522, + "learning_rate": 7.826353977424648e-05, + "loss": 1.9809, + "step": 465 + }, + { + "epoch": 0.1125332045399662, + "grad_norm": 0.3076111078262329, + "learning_rate": 7.825450325082125e-05, + "loss": 1.7802, + "step": 466 + }, + { + "epoch": 0.11277469210335668, + "grad_norm": 0.3477588891983032, + "learning_rate": 7.824544379981667e-05, + "loss": 1.7485, + "step": 467 + }, + { + "epoch": 0.11301617966674717, + "grad_norm": 0.3137153685092926, + "learning_rate": 7.823636142666246e-05, + "loss": 1.843, + "step": 468 + }, + { + "epoch": 0.11325766723013765, + "grad_norm": 0.3323186933994293, + "learning_rate": 7.822725613680208e-05, + "loss": 1.9249, + "step": 469 + }, + { + "epoch": 0.11349915479352814, + "grad_norm": 0.3301668167114258, + "learning_rate": 7.821812793569272e-05, + "loss": 1.7981, + "step": 
470 + }, + { + "epoch": 0.11374064235691862, + "grad_norm": 0.30615851283073425, + "learning_rate": 7.820897682880532e-05, + "loss": 1.7497, + "step": 471 + }, + { + "epoch": 0.11398212992030911, + "grad_norm": 0.32018932700157166, + "learning_rate": 7.819980282162453e-05, + "loss": 1.6479, + "step": 472 + }, + { + "epoch": 0.1142236174836996, + "grad_norm": 0.29449161887168884, + "learning_rate": 7.81906059196487e-05, + "loss": 1.7017, + "step": 473 + }, + { + "epoch": 0.11446510504709008, + "grad_norm": 0.38252466917037964, + "learning_rate": 7.818138612838998e-05, + "loss": 1.9334, + "step": 474 + }, + { + "epoch": 0.11470659261048056, + "grad_norm": 0.307882159948349, + "learning_rate": 7.817214345337416e-05, + "loss": 1.7899, + "step": 475 + }, + { + "epoch": 0.11494808017387105, + "grad_norm": 0.33490505814552307, + "learning_rate": 7.816287790014078e-05, + "loss": 1.8565, + "step": 476 + }, + { + "epoch": 0.11518956773726154, + "grad_norm": 0.3033045530319214, + "learning_rate": 7.81535894742431e-05, + "loss": 1.7633, + "step": 477 + }, + { + "epoch": 0.11543105530065202, + "grad_norm": 0.3428524136543274, + "learning_rate": 7.814427818124805e-05, + "loss": 1.9177, + "step": 478 + }, + { + "epoch": 0.11567254286404251, + "grad_norm": 0.2950695753097534, + "learning_rate": 7.813494402673631e-05, + "loss": 1.7384, + "step": 479 + }, + { + "epoch": 0.11591403042743299, + "grad_norm": 0.3069492280483246, + "learning_rate": 7.812558701630223e-05, + "loss": 1.8424, + "step": 480 + }, + { + "epoch": 0.11615551799082348, + "grad_norm": 0.32316040992736816, + "learning_rate": 7.811620715555388e-05, + "loss": 1.8142, + "step": 481 + }, + { + "epoch": 0.11639700555421396, + "grad_norm": 0.3073161840438843, + "learning_rate": 7.810680445011302e-05, + "loss": 1.707, + "step": 482 + }, + { + "epoch": 0.11663849311760445, + "grad_norm": 0.3104289770126343, + "learning_rate": 7.80973789056151e-05, + "loss": 1.6462, + "step": 483 + }, + { + "epoch": 0.11687998068099494, + "grad_norm": 0.3424341082572937, + "learning_rate": 7.808793052770923e-05, + "loss": 1.7575, + "step": 484 + }, + { + "epoch": 0.11712146824438542, + "grad_norm": 0.3131846785545349, + "learning_rate": 7.807845932205829e-05, + "loss": 1.8376, + "step": 485 + }, + { + "epoch": 0.11736295580777589, + "grad_norm": 0.3051791489124298, + "learning_rate": 7.806896529433872e-05, + "loss": 1.7343, + "step": 486 + }, + { + "epoch": 0.11760444337116638, + "grad_norm": 0.33665788173675537, + "learning_rate": 7.805944845024072e-05, + "loss": 2.0152, + "step": 487 + }, + { + "epoch": 0.11784593093455686, + "grad_norm": 0.31663084030151367, + "learning_rate": 7.804990879546817e-05, + "loss": 1.8096, + "step": 488 + }, + { + "epoch": 0.11808741849794735, + "grad_norm": 0.28980234265327454, + "learning_rate": 7.804034633573856e-05, + "loss": 1.5561, + "step": 489 + }, + { + "epoch": 0.11832890606133784, + "grad_norm": 0.3290766775608063, + "learning_rate": 7.803076107678314e-05, + "loss": 1.9711, + "step": 490 + }, + { + "epoch": 0.11857039362472832, + "grad_norm": 0.3099691867828369, + "learning_rate": 7.802115302434671e-05, + "loss": 1.7569, + "step": 491 + }, + { + "epoch": 0.1188118811881188, + "grad_norm": 0.2974450886249542, + "learning_rate": 7.801152218418784e-05, + "loss": 1.6721, + "step": 492 + }, + { + "epoch": 0.11905336875150929, + "grad_norm": 0.30560582876205444, + "learning_rate": 7.800186856207867e-05, + "loss": 1.797, + "step": 493 + }, + { + "epoch": 0.11929485631489978, + "grad_norm": 0.311374694108963, + "learning_rate": 
7.799219216380506e-05, + "loss": 1.8303, + "step": 494 + }, + { + "epoch": 0.11953634387829026, + "grad_norm": 0.30325567722320557, + "learning_rate": 7.798249299516649e-05, + "loss": 1.6506, + "step": 495 + }, + { + "epoch": 0.11977783144168075, + "grad_norm": 0.30898353457450867, + "learning_rate": 7.797277106197609e-05, + "loss": 1.7336, + "step": 496 + }, + { + "epoch": 0.12001931900507123, + "grad_norm": 0.3123491704463959, + "learning_rate": 7.796302637006063e-05, + "loss": 1.8833, + "step": 497 + }, + { + "epoch": 0.12026080656846172, + "grad_norm": 0.3127240836620331, + "learning_rate": 7.795325892526054e-05, + "loss": 1.8374, + "step": 498 + }, + { + "epoch": 0.1205022941318522, + "grad_norm": 0.29663944244384766, + "learning_rate": 7.794346873342985e-05, + "loss": 1.7163, + "step": 499 + }, + { + "epoch": 0.12074378169524269, + "grad_norm": 0.30658090114593506, + "learning_rate": 7.793365580043625e-05, + "loss": 1.6681, + "step": 500 + }, + { + "epoch": 0.12098526925863318, + "grad_norm": 0.33467593789100647, + "learning_rate": 7.792382013216108e-05, + "loss": 1.9931, + "step": 501 + }, + { + "epoch": 0.12122675682202366, + "grad_norm": 0.33266401290893555, + "learning_rate": 7.791396173449926e-05, + "loss": 1.9241, + "step": 502 + }, + { + "epoch": 0.12146824438541415, + "grad_norm": 0.32820233702659607, + "learning_rate": 7.790408061335935e-05, + "loss": 1.804, + "step": 503 + }, + { + "epoch": 0.12170973194880463, + "grad_norm": 0.3111535608768463, + "learning_rate": 7.789417677466356e-05, + "loss": 1.8708, + "step": 504 + }, + { + "epoch": 0.12195121951219512, + "grad_norm": 0.3077269196510315, + "learning_rate": 7.788425022434766e-05, + "loss": 1.8725, + "step": 505 + }, + { + "epoch": 0.1221927070755856, + "grad_norm": 0.31311750411987305, + "learning_rate": 7.787430096836107e-05, + "loss": 1.7439, + "step": 506 + }, + { + "epoch": 0.12243419463897609, + "grad_norm": 0.3787113428115845, + "learning_rate": 7.786432901266681e-05, + "loss": 1.8704, + "step": 507 + }, + { + "epoch": 0.12267568220236658, + "grad_norm": 0.30987265706062317, + "learning_rate": 7.785433436324153e-05, + "loss": 1.7655, + "step": 508 + }, + { + "epoch": 0.12291716976575706, + "grad_norm": 0.327898234128952, + "learning_rate": 7.78443170260754e-05, + "loss": 1.7135, + "step": 509 + }, + { + "epoch": 0.12315865732914755, + "grad_norm": 0.3035421073436737, + "learning_rate": 7.78342770071723e-05, + "loss": 1.7768, + "step": 510 + }, + { + "epoch": 0.12340014489253803, + "grad_norm": 0.33335283398628235, + "learning_rate": 7.78242143125496e-05, + "loss": 1.8051, + "step": 511 + }, + { + "epoch": 0.12364163245592852, + "grad_norm": 0.2996819317340851, + "learning_rate": 7.781412894823837e-05, + "loss": 1.6147, + "step": 512 + }, + { + "epoch": 0.123883120019319, + "grad_norm": 0.33031609654426575, + "learning_rate": 7.780402092028314e-05, + "loss": 2.0319, + "step": 513 + }, + { + "epoch": 0.12412460758270949, + "grad_norm": 0.33066943287849426, + "learning_rate": 7.779389023474212e-05, + "loss": 1.7073, + "step": 514 + }, + { + "epoch": 0.12436609514609998, + "grad_norm": 0.31213587522506714, + "learning_rate": 7.778373689768707e-05, + "loss": 1.8183, + "step": 515 + }, + { + "epoch": 0.12460758270949046, + "grad_norm": 0.3083159625530243, + "learning_rate": 7.777356091520333e-05, + "loss": 1.6001, + "step": 516 + }, + { + "epoch": 0.12484907027288095, + "grad_norm": 0.3064626455307007, + "learning_rate": 7.776336229338978e-05, + "loss": 1.691, + "step": 517 + }, + { + "epoch": 0.12509055783627143, + 
"grad_norm": 0.3259071707725525, + "learning_rate": 7.775314103835892e-05, + "loss": 1.7561, + "step": 518 + }, + { + "epoch": 0.12533204539966192, + "grad_norm": 0.3399283289909363, + "learning_rate": 7.774289715623677e-05, + "loss": 1.7772, + "step": 519 + }, + { + "epoch": 0.1255735329630524, + "grad_norm": 0.29182565212249756, + "learning_rate": 7.773263065316296e-05, + "loss": 1.8109, + "step": 520 + }, + { + "epoch": 0.1258150205264429, + "grad_norm": 0.30191174149513245, + "learning_rate": 7.772234153529061e-05, + "loss": 1.8194, + "step": 521 + }, + { + "epoch": 0.12605650808983337, + "grad_norm": 0.3192642629146576, + "learning_rate": 7.771202980878648e-05, + "loss": 1.8612, + "step": 522 + }, + { + "epoch": 0.12629799565322386, + "grad_norm": 0.35645371675491333, + "learning_rate": 7.770169547983081e-05, + "loss": 1.8897, + "step": 523 + }, + { + "epoch": 0.12653948321661435, + "grad_norm": 0.2943851053714752, + "learning_rate": 7.769133855461739e-05, + "loss": 1.7457, + "step": 524 + }, + { + "epoch": 0.12678097078000483, + "grad_norm": 0.30867692828178406, + "learning_rate": 7.768095903935362e-05, + "loss": 1.7291, + "step": 525 + }, + { + "epoch": 0.12702245834339532, + "grad_norm": 0.315302312374115, + "learning_rate": 7.767055694026037e-05, + "loss": 1.7178, + "step": 526 + }, + { + "epoch": 0.1272639459067858, + "grad_norm": 0.31131064891815186, + "learning_rate": 7.766013226357204e-05, + "loss": 1.7799, + "step": 527 + }, + { + "epoch": 0.1275054334701763, + "grad_norm": 0.31616145372390747, + "learning_rate": 7.764968501553663e-05, + "loss": 1.7628, + "step": 528 + }, + { + "epoch": 0.12774692103356677, + "grad_norm": 0.33065786957740784, + "learning_rate": 7.763921520241561e-05, + "loss": 1.8967, + "step": 529 + }, + { + "epoch": 0.12798840859695726, + "grad_norm": 0.31441378593444824, + "learning_rate": 7.762872283048401e-05, + "loss": 1.7663, + "step": 530 + }, + { + "epoch": 0.12822989616034774, + "grad_norm": 0.3112833797931671, + "learning_rate": 7.761820790603032e-05, + "loss": 1.7743, + "step": 531 + }, + { + "epoch": 0.12847138372373823, + "grad_norm": 0.32388782501220703, + "learning_rate": 7.760767043535665e-05, + "loss": 1.6944, + "step": 532 + }, + { + "epoch": 0.12871287128712872, + "grad_norm": 0.32488951086997986, + "learning_rate": 7.759711042477852e-05, + "loss": 1.7904, + "step": 533 + }, + { + "epoch": 0.1289543588505192, + "grad_norm": 0.3164033889770508, + "learning_rate": 7.7586527880625e-05, + "loss": 1.6745, + "step": 534 + }, + { + "epoch": 0.1291958464139097, + "grad_norm": 0.2963300347328186, + "learning_rate": 7.757592280923868e-05, + "loss": 1.6547, + "step": 535 + }, + { + "epoch": 0.12943733397730017, + "grad_norm": 0.4110172390937805, + "learning_rate": 7.756529521697564e-05, + "loss": 1.7822, + "step": 536 + }, + { + "epoch": 0.12967882154069066, + "grad_norm": 0.31125178933143616, + "learning_rate": 7.755464511020546e-05, + "loss": 1.5875, + "step": 537 + }, + { + "epoch": 0.12992030910408114, + "grad_norm": 0.3192461431026459, + "learning_rate": 7.75439724953112e-05, + "loss": 1.7424, + "step": 538 + }, + { + "epoch": 0.13016179666747163, + "grad_norm": 0.3069697320461273, + "learning_rate": 7.75332773786894e-05, + "loss": 1.6848, + "step": 539 + }, + { + "epoch": 0.13040328423086212, + "grad_norm": 0.2977539002895355, + "learning_rate": 7.752255976675016e-05, + "loss": 1.6205, + "step": 540 + }, + { + "epoch": 0.1306447717942526, + "grad_norm": 0.3145061135292053, + "learning_rate": 7.751181966591695e-05, + "loss": 1.7091, + "step": 
541 + }, + { + "epoch": 0.1308862593576431, + "grad_norm": 0.3141860067844391, + "learning_rate": 7.750105708262682e-05, + "loss": 1.8047, + "step": 542 + }, + { + "epoch": 0.13112774692103357, + "grad_norm": 0.31395336985588074, + "learning_rate": 7.749027202333023e-05, + "loss": 1.7166, + "step": 543 + }, + { + "epoch": 0.13136923448442406, + "grad_norm": 0.3210203945636749, + "learning_rate": 7.747946449449115e-05, + "loss": 1.8435, + "step": 544 + }, + { + "epoch": 0.13161072204781454, + "grad_norm": 0.32427695393562317, + "learning_rate": 7.746863450258698e-05, + "loss": 1.8571, + "step": 545 + }, + { + "epoch": 0.13185220961120503, + "grad_norm": 0.3196699321269989, + "learning_rate": 7.74577820541086e-05, + "loss": 1.5772, + "step": 546 + }, + { + "epoch": 0.13209369717459551, + "grad_norm": 0.31027519702911377, + "learning_rate": 7.744690715556039e-05, + "loss": 1.6998, + "step": 547 + }, + { + "epoch": 0.132335184737986, + "grad_norm": 0.3263092637062073, + "learning_rate": 7.74360098134601e-05, + "loss": 1.7748, + "step": 548 + }, + { + "epoch": 0.13257667230137649, + "grad_norm": 0.33741897344589233, + "learning_rate": 7.7425090034339e-05, + "loss": 1.919, + "step": 549 + }, + { + "epoch": 0.13281815986476697, + "grad_norm": 0.31624868512153625, + "learning_rate": 7.741414782474179e-05, + "loss": 1.6449, + "step": 550 + }, + { + "epoch": 0.13305964742815746, + "grad_norm": 0.3361668884754181, + "learning_rate": 7.740318319122661e-05, + "loss": 2.002, + "step": 551 + }, + { + "epoch": 0.13330113499154794, + "grad_norm": 0.3008631765842438, + "learning_rate": 7.739219614036504e-05, + "loss": 1.7912, + "step": 552 + }, + { + "epoch": 0.13354262255493843, + "grad_norm": 0.3018030524253845, + "learning_rate": 7.738118667874208e-05, + "loss": 1.7734, + "step": 553 + }, + { + "epoch": 0.1337841101183289, + "grad_norm": 0.31376197934150696, + "learning_rate": 7.737015481295618e-05, + "loss": 1.8312, + "step": 554 + }, + { + "epoch": 0.1340255976817194, + "grad_norm": 0.30704107880592346, + "learning_rate": 7.735910054961924e-05, + "loss": 1.7782, + "step": 555 + }, + { + "epoch": 0.13426708524510989, + "grad_norm": 0.309739351272583, + "learning_rate": 7.734802389535652e-05, + "loss": 1.7463, + "step": 556 + }, + { + "epoch": 0.13450857280850037, + "grad_norm": 0.30903351306915283, + "learning_rate": 7.733692485680677e-05, + "loss": 1.7354, + "step": 557 + }, + { + "epoch": 0.13475006037189086, + "grad_norm": 0.3019959330558777, + "learning_rate": 7.73258034406221e-05, + "loss": 1.6105, + "step": 558 + }, + { + "epoch": 0.13499154793528134, + "grad_norm": 0.30389368534088135, + "learning_rate": 7.731465965346809e-05, + "loss": 1.681, + "step": 559 + }, + { + "epoch": 0.13523303549867183, + "grad_norm": 0.33595752716064453, + "learning_rate": 7.730349350202366e-05, + "loss": 1.9905, + "step": 560 + }, + { + "epoch": 0.1354745230620623, + "grad_norm": 0.31182244420051575, + "learning_rate": 7.729230499298118e-05, + "loss": 1.7488, + "step": 561 + }, + { + "epoch": 0.1357160106254528, + "grad_norm": 0.30236539244651794, + "learning_rate": 7.72810941330464e-05, + "loss": 1.8256, + "step": 562 + }, + { + "epoch": 0.13595749818884328, + "grad_norm": 0.3096561133861542, + "learning_rate": 7.72698609289385e-05, + "loss": 1.6412, + "step": 563 + }, + { + "epoch": 0.13619898575223377, + "grad_norm": 0.317247599363327, + "learning_rate": 7.725860538739e-05, + "loss": 1.8292, + "step": 564 + }, + { + "epoch": 0.13644047331562426, + "grad_norm": 0.3323989808559418, + "learning_rate": 
7.724732751514684e-05, + "loss": 2.0581, + "step": 565 + }, + { + "epoch": 0.13668196087901474, + "grad_norm": 0.3129768669605255, + "learning_rate": 7.723602731896833e-05, + "loss": 1.8479, + "step": 566 + }, + { + "epoch": 0.13692344844240523, + "grad_norm": 0.3054257035255432, + "learning_rate": 7.722470480562717e-05, + "loss": 1.7895, + "step": 567 + }, + { + "epoch": 0.1371649360057957, + "grad_norm": 0.3174823820590973, + "learning_rate": 7.721335998190944e-05, + "loss": 1.7581, + "step": 568 + }, + { + "epoch": 0.1374064235691862, + "grad_norm": 0.3012676239013672, + "learning_rate": 7.720199285461459e-05, + "loss": 1.751, + "step": 569 + }, + { + "epoch": 0.13764791113257668, + "grad_norm": 0.30346378684043884, + "learning_rate": 7.719060343055541e-05, + "loss": 1.6166, + "step": 570 + }, + { + "epoch": 0.13788939869596717, + "grad_norm": 0.3010426163673401, + "learning_rate": 7.717919171655809e-05, + "loss": 1.661, + "step": 571 + }, + { + "epoch": 0.13813088625935765, + "grad_norm": 0.32389774918556213, + "learning_rate": 7.716775771946214e-05, + "loss": 1.8483, + "step": 572 + }, + { + "epoch": 0.13837237382274814, + "grad_norm": 0.33098238706588745, + "learning_rate": 7.71563014461205e-05, + "loss": 1.8177, + "step": 573 + }, + { + "epoch": 0.13861386138613863, + "grad_norm": 0.3251645565032959, + "learning_rate": 7.714482290339936e-05, + "loss": 1.922, + "step": 574 + }, + { + "epoch": 0.1388553489495291, + "grad_norm": 0.3154045045375824, + "learning_rate": 7.713332209817832e-05, + "loss": 1.6444, + "step": 575 + }, + { + "epoch": 0.1390968365129196, + "grad_norm": 0.32885581254959106, + "learning_rate": 7.712179903735033e-05, + "loss": 1.699, + "step": 576 + }, + { + "epoch": 0.13933832407631008, + "grad_norm": 0.3207506835460663, + "learning_rate": 7.711025372782164e-05, + "loss": 1.9586, + "step": 577 + }, + { + "epoch": 0.13957981163970057, + "grad_norm": 0.30934199690818787, + "learning_rate": 7.709868617651186e-05, + "loss": 1.6781, + "step": 578 + }, + { + "epoch": 0.13982129920309105, + "grad_norm": 0.35921943187713623, + "learning_rate": 7.708709639035394e-05, + "loss": 2.1063, + "step": 579 + }, + { + "epoch": 0.14006278676648154, + "grad_norm": 0.2932882010936737, + "learning_rate": 7.707548437629411e-05, + "loss": 1.7951, + "step": 580 + }, + { + "epoch": 0.14030427432987203, + "grad_norm": 0.29395684599876404, + "learning_rate": 7.706385014129198e-05, + "loss": 1.5773, + "step": 581 + }, + { + "epoch": 0.1405457618932625, + "grad_norm": 0.31152844429016113, + "learning_rate": 7.705219369232041e-05, + "loss": 1.8562, + "step": 582 + }, + { + "epoch": 0.14078724945665297, + "grad_norm": 0.312266081571579, + "learning_rate": 7.704051503636566e-05, + "loss": 1.6907, + "step": 583 + }, + { + "epoch": 0.14102873702004345, + "grad_norm": 0.3400667905807495, + "learning_rate": 7.702881418042723e-05, + "loss": 1.7507, + "step": 584 + }, + { + "epoch": 0.14127022458343394, + "grad_norm": 0.3202289640903473, + "learning_rate": 7.701709113151795e-05, + "loss": 1.7275, + "step": 585 + }, + { + "epoch": 0.14151171214682443, + "grad_norm": 0.32572951912879944, + "learning_rate": 7.700534589666397e-05, + "loss": 1.8505, + "step": 586 + }, + { + "epoch": 0.1417531997102149, + "grad_norm": 0.3211197555065155, + "learning_rate": 7.699357848290469e-05, + "loss": 1.7782, + "step": 587 + }, + { + "epoch": 0.1419946872736054, + "grad_norm": 0.318483829498291, + "learning_rate": 7.698178889729286e-05, + "loss": 1.869, + "step": 588 + }, + { + "epoch": 0.14223617483699588, + "grad_norm": 
0.29991650581359863, + "learning_rate": 7.696997714689445e-05, + "loss": 1.7344, + "step": 589 + }, + { + "epoch": 0.14247766240038637, + "grad_norm": 0.32764291763305664, + "learning_rate": 7.695814323878878e-05, + "loss": 1.9262, + "step": 590 + }, + { + "epoch": 0.14271914996377685, + "grad_norm": 0.3038786053657532, + "learning_rate": 7.694628718006843e-05, + "loss": 1.6972, + "step": 591 + }, + { + "epoch": 0.14296063752716734, + "grad_norm": 0.3058617115020752, + "learning_rate": 7.693440897783923e-05, + "loss": 1.7624, + "step": 592 + }, + { + "epoch": 0.14320212509055782, + "grad_norm": 0.29824337363243103, + "learning_rate": 7.692250863922031e-05, + "loss": 1.6855, + "step": 593 + }, + { + "epoch": 0.1434436126539483, + "grad_norm": 0.3037334084510803, + "learning_rate": 7.691058617134406e-05, + "loss": 1.7016, + "step": 594 + }, + { + "epoch": 0.1436851002173388, + "grad_norm": 0.3068223297595978, + "learning_rate": 7.689864158135612e-05, + "loss": 1.6969, + "step": 595 + }, + { + "epoch": 0.14392658778072928, + "grad_norm": 0.2943803369998932, + "learning_rate": 7.688667487641541e-05, + "loss": 1.6967, + "step": 596 + }, + { + "epoch": 0.14416807534411977, + "grad_norm": 0.3013668954372406, + "learning_rate": 7.687468606369409e-05, + "loss": 1.8011, + "step": 597 + }, + { + "epoch": 0.14440956290751025, + "grad_norm": 0.30205124616622925, + "learning_rate": 7.686267515037758e-05, + "loss": 1.672, + "step": 598 + }, + { + "epoch": 0.14465105047090074, + "grad_norm": 0.3033943474292755, + "learning_rate": 7.685064214366453e-05, + "loss": 1.8067, + "step": 599 + }, + { + "epoch": 0.14489253803429122, + "grad_norm": 0.30258500576019287, + "learning_rate": 7.683858705076684e-05, + "loss": 1.8625, + "step": 600 + }, + { + "epoch": 0.1451340255976817, + "grad_norm": 0.30624958872795105, + "learning_rate": 7.682650987890967e-05, + "loss": 1.8142, + "step": 601 + }, + { + "epoch": 0.1453755131610722, + "grad_norm": 0.30320626497268677, + "learning_rate": 7.681441063533138e-05, + "loss": 1.6951, + "step": 602 + }, + { + "epoch": 0.14561700072446268, + "grad_norm": 0.2955961525440216, + "learning_rate": 7.680228932728357e-05, + "loss": 1.6897, + "step": 603 + }, + { + "epoch": 0.14585848828785317, + "grad_norm": 0.3333013951778412, + "learning_rate": 7.679014596203104e-05, + "loss": 1.8817, + "step": 604 + }, + { + "epoch": 0.14609997585124365, + "grad_norm": 0.30491143465042114, + "learning_rate": 7.677798054685187e-05, + "loss": 1.6913, + "step": 605 + }, + { + "epoch": 0.14634146341463414, + "grad_norm": 0.30941450595855713, + "learning_rate": 7.676579308903732e-05, + "loss": 1.8067, + "step": 606 + }, + { + "epoch": 0.14658295097802462, + "grad_norm": 0.3031584620475769, + "learning_rate": 7.675358359589183e-05, + "loss": 1.8378, + "step": 607 + }, + { + "epoch": 0.1468244385414151, + "grad_norm": 0.32273533940315247, + "learning_rate": 7.67413520747331e-05, + "loss": 1.76, + "step": 608 + }, + { + "epoch": 0.1470659261048056, + "grad_norm": 0.3251398801803589, + "learning_rate": 7.6729098532892e-05, + "loss": 1.8813, + "step": 609 + }, + { + "epoch": 0.14730741366819608, + "grad_norm": 0.3235621750354767, + "learning_rate": 7.671682297771263e-05, + "loss": 1.9019, + "step": 610 + }, + { + "epoch": 0.14754890123158657, + "grad_norm": 0.30117496848106384, + "learning_rate": 7.670452541655224e-05, + "loss": 1.7701, + "step": 611 + }, + { + "epoch": 0.14779038879497705, + "grad_norm": 0.32307854294776917, + "learning_rate": 7.669220585678128e-05, + "loss": 1.908, + "step": 612 + }, + { + 
"epoch": 0.14803187635836754, + "grad_norm": 0.3234044015407562, + "learning_rate": 7.667986430578343e-05, + "loss": 2.0091, + "step": 613 + }, + { + "epoch": 0.14827336392175802, + "grad_norm": 0.3080267906188965, + "learning_rate": 7.666750077095548e-05, + "loss": 1.8048, + "step": 614 + }, + { + "epoch": 0.1485148514851485, + "grad_norm": 0.3124663233757019, + "learning_rate": 7.665511525970745e-05, + "loss": 1.8464, + "step": 615 + }, + { + "epoch": 0.148756339048539, + "grad_norm": 0.31696856021881104, + "learning_rate": 7.664270777946252e-05, + "loss": 1.8327, + "step": 616 + }, + { + "epoch": 0.14899782661192948, + "grad_norm": 0.3064039349555969, + "learning_rate": 7.663027833765702e-05, + "loss": 1.7434, + "step": 617 + }, + { + "epoch": 0.14923931417531996, + "grad_norm": 0.30550166964530945, + "learning_rate": 7.661782694174044e-05, + "loss": 1.6736, + "step": 618 + }, + { + "epoch": 0.14948080173871045, + "grad_norm": 0.3370753228664398, + "learning_rate": 7.660535359917547e-05, + "loss": 1.7706, + "step": 619 + }, + { + "epoch": 0.14972228930210094, + "grad_norm": 0.3034164309501648, + "learning_rate": 7.659285831743789e-05, + "loss": 1.6429, + "step": 620 + }, + { + "epoch": 0.14996377686549142, + "grad_norm": 0.32384395599365234, + "learning_rate": 7.65803411040167e-05, + "loss": 1.9318, + "step": 621 + }, + { + "epoch": 0.1502052644288819, + "grad_norm": 0.3539518117904663, + "learning_rate": 7.656780196641397e-05, + "loss": 2.1674, + "step": 622 + }, + { + "epoch": 0.1504467519922724, + "grad_norm": 0.3114670217037201, + "learning_rate": 7.655524091214497e-05, + "loss": 1.8364, + "step": 623 + }, + { + "epoch": 0.15068823955566288, + "grad_norm": 0.29424378275871277, + "learning_rate": 7.65426579487381e-05, + "loss": 1.6477, + "step": 624 + }, + { + "epoch": 0.15092972711905336, + "grad_norm": 0.35713982582092285, + "learning_rate": 7.653005308373482e-05, + "loss": 1.8045, + "step": 625 + }, + { + "epoch": 0.15117121468244385, + "grad_norm": 0.30860796570777893, + "learning_rate": 7.651742632468984e-05, + "loss": 1.9516, + "step": 626 + }, + { + "epoch": 0.15141270224583434, + "grad_norm": 0.2912321984767914, + "learning_rate": 7.650477767917087e-05, + "loss": 1.6368, + "step": 627 + }, + { + "epoch": 0.15165418980922482, + "grad_norm": 0.30829671025276184, + "learning_rate": 7.64921071547588e-05, + "loss": 1.865, + "step": 628 + }, + { + "epoch": 0.1518956773726153, + "grad_norm": 0.3148002028465271, + "learning_rate": 7.647941475904765e-05, + "loss": 1.8414, + "step": 629 + }, + { + "epoch": 0.1521371649360058, + "grad_norm": 0.3275551199913025, + "learning_rate": 7.646670049964449e-05, + "loss": 1.7045, + "step": 630 + }, + { + "epoch": 0.15237865249939628, + "grad_norm": 0.3136232793331146, + "learning_rate": 7.645396438416955e-05, + "loss": 1.7327, + "step": 631 + }, + { + "epoch": 0.15262014006278676, + "grad_norm": 0.31527405977249146, + "learning_rate": 7.644120642025613e-05, + "loss": 1.7708, + "step": 632 + }, + { + "epoch": 0.15286162762617725, + "grad_norm": 0.3223884701728821, + "learning_rate": 7.64284266155506e-05, + "loss": 1.8636, + "step": 633 + }, + { + "epoch": 0.15310311518956773, + "grad_norm": 0.31887826323509216, + "learning_rate": 7.64156249777125e-05, + "loss": 1.8202, + "step": 634 + }, + { + "epoch": 0.15334460275295822, + "grad_norm": 0.35070282220840454, + "learning_rate": 7.640280151441439e-05, + "loss": 2.0275, + "step": 635 + }, + { + "epoch": 0.1535860903163487, + "grad_norm": 0.3492684066295624, + "learning_rate": 7.63899562333419e-05, + 
"loss": 1.8542, + "step": 636 + }, + { + "epoch": 0.1538275778797392, + "grad_norm": 0.30183926224708557, + "learning_rate": 7.637708914219378e-05, + "loss": 1.6828, + "step": 637 + }, + { + "epoch": 0.15406906544312968, + "grad_norm": 0.3578021824359894, + "learning_rate": 7.636420024868184e-05, + "loss": 1.8462, + "step": 638 + }, + { + "epoch": 0.15431055300652016, + "grad_norm": 0.3210180401802063, + "learning_rate": 7.635128956053094e-05, + "loss": 1.7725, + "step": 639 + }, + { + "epoch": 0.15455204056991065, + "grad_norm": 0.30064085125923157, + "learning_rate": 7.633835708547904e-05, + "loss": 1.6716, + "step": 640 + }, + { + "epoch": 0.15479352813330113, + "grad_norm": 0.31479454040527344, + "learning_rate": 7.63254028312771e-05, + "loss": 1.687, + "step": 641 + }, + { + "epoch": 0.15503501569669162, + "grad_norm": 0.3353448510169983, + "learning_rate": 7.631242680568916e-05, + "loss": 2.04, + "step": 642 + }, + { + "epoch": 0.1552765032600821, + "grad_norm": 0.338562548160553, + "learning_rate": 7.629942901649236e-05, + "loss": 1.9637, + "step": 643 + }, + { + "epoch": 0.1555179908234726, + "grad_norm": 0.3367462456226349, + "learning_rate": 7.62864094714768e-05, + "loss": 1.9682, + "step": 644 + }, + { + "epoch": 0.15575947838686308, + "grad_norm": 0.29863229393959045, + "learning_rate": 7.627336817844565e-05, + "loss": 1.6644, + "step": 645 + }, + { + "epoch": 0.15600096595025356, + "grad_norm": 0.31191080808639526, + "learning_rate": 7.626030514521516e-05, + "loss": 1.7951, + "step": 646 + }, + { + "epoch": 0.15624245351364405, + "grad_norm": 0.3201664686203003, + "learning_rate": 7.624722037961453e-05, + "loss": 1.7746, + "step": 647 + }, + { + "epoch": 0.15648394107703453, + "grad_norm": 0.3075219988822937, + "learning_rate": 7.623411388948606e-05, + "loss": 1.6502, + "step": 648 + }, + { + "epoch": 0.15672542864042502, + "grad_norm": 0.31997132301330566, + "learning_rate": 7.622098568268502e-05, + "loss": 1.9077, + "step": 649 + }, + { + "epoch": 0.1569669162038155, + "grad_norm": 0.3383060097694397, + "learning_rate": 7.620783576707971e-05, + "loss": 1.8237, + "step": 650 + }, + { + "epoch": 0.157208403767206, + "grad_norm": 0.29756960272789, + "learning_rate": 7.619466415055146e-05, + "loss": 1.6257, + "step": 651 + }, + { + "epoch": 0.15744989133059648, + "grad_norm": 0.29845669865608215, + "learning_rate": 7.618147084099455e-05, + "loss": 1.7794, + "step": 652 + }, + { + "epoch": 0.15769137889398696, + "grad_norm": 0.2993009090423584, + "learning_rate": 7.616825584631635e-05, + "loss": 1.7485, + "step": 653 + }, + { + "epoch": 0.15793286645737745, + "grad_norm": 0.29437655210494995, + "learning_rate": 7.615501917443715e-05, + "loss": 1.7053, + "step": 654 + }, + { + "epoch": 0.15817435402076793, + "grad_norm": 0.29206910729408264, + "learning_rate": 7.614176083329028e-05, + "loss": 1.6886, + "step": 655 + }, + { + "epoch": 0.15841584158415842, + "grad_norm": 0.3151334524154663, + "learning_rate": 7.6128480830822e-05, + "loss": 1.8169, + "step": 656 + }, + { + "epoch": 0.1586573291475489, + "grad_norm": 0.48860466480255127, + "learning_rate": 7.611517917499164e-05, + "loss": 2.2244, + "step": 657 + }, + { + "epoch": 0.1588988167109394, + "grad_norm": 0.2999897003173828, + "learning_rate": 7.610185587377143e-05, + "loss": 1.6181, + "step": 658 + }, + { + "epoch": 0.15914030427432987, + "grad_norm": 0.3048444390296936, + "learning_rate": 7.608851093514659e-05, + "loss": 1.8048, + "step": 659 + }, + { + "epoch": 0.15938179183772036, + "grad_norm": 0.3159238398075104, + 
"learning_rate": 7.607514436711534e-05, + "loss": 1.7586, + "step": 660 + }, + { + "epoch": 0.15962327940111085, + "grad_norm": 0.32997772097587585, + "learning_rate": 7.606175617768884e-05, + "loss": 1.8612, + "step": 661 + }, + { + "epoch": 0.15986476696450133, + "grad_norm": 0.30874085426330566, + "learning_rate": 7.60483463748912e-05, + "loss": 1.8608, + "step": 662 + }, + { + "epoch": 0.16010625452789182, + "grad_norm": 0.3253762722015381, + "learning_rate": 7.603491496675951e-05, + "loss": 1.9862, + "step": 663 + }, + { + "epoch": 0.1603477420912823, + "grad_norm": 0.3072294592857361, + "learning_rate": 7.602146196134378e-05, + "loss": 1.6203, + "step": 664 + }, + { + "epoch": 0.1605892296546728, + "grad_norm": 0.32605570554733276, + "learning_rate": 7.6007987366707e-05, + "loss": 1.7996, + "step": 665 + }, + { + "epoch": 0.16083071721806327, + "grad_norm": 0.2969420254230499, + "learning_rate": 7.599449119092504e-05, + "loss": 1.7149, + "step": 666 + }, + { + "epoch": 0.16107220478145376, + "grad_norm": 0.3176361918449402, + "learning_rate": 7.598097344208679e-05, + "loss": 1.7544, + "step": 667 + }, + { + "epoch": 0.16131369234484425, + "grad_norm": 0.3468224108219147, + "learning_rate": 7.596743412829398e-05, + "loss": 1.6971, + "step": 668 + }, + { + "epoch": 0.16155517990823473, + "grad_norm": 0.3290475308895111, + "learning_rate": 7.595387325766133e-05, + "loss": 1.7264, + "step": 669 + }, + { + "epoch": 0.16179666747162522, + "grad_norm": 0.31547752022743225, + "learning_rate": 7.594029083831644e-05, + "loss": 1.7264, + "step": 670 + }, + { + "epoch": 0.1620381550350157, + "grad_norm": 0.3173413872718811, + "learning_rate": 7.592668687839987e-05, + "loss": 1.7354, + "step": 671 + }, + { + "epoch": 0.1622796425984062, + "grad_norm": 0.35088086128234863, + "learning_rate": 7.591306138606502e-05, + "loss": 1.6187, + "step": 672 + }, + { + "epoch": 0.16252113016179667, + "grad_norm": 0.31287261843681335, + "learning_rate": 7.589941436947828e-05, + "loss": 1.7694, + "step": 673 + }, + { + "epoch": 0.16276261772518716, + "grad_norm": 0.35935917496681213, + "learning_rate": 7.588574583681888e-05, + "loss": 1.9953, + "step": 674 + }, + { + "epoch": 0.16300410528857764, + "grad_norm": 0.32228848338127136, + "learning_rate": 7.587205579627896e-05, + "loss": 1.8309, + "step": 675 + }, + { + "epoch": 0.16324559285196813, + "grad_norm": 0.324415385723114, + "learning_rate": 7.585834425606355e-05, + "loss": 1.8214, + "step": 676 + }, + { + "epoch": 0.16348708041535862, + "grad_norm": 0.31052157282829285, + "learning_rate": 7.584461122439057e-05, + "loss": 1.6383, + "step": 677 + }, + { + "epoch": 0.1637285679787491, + "grad_norm": 0.30478107929229736, + "learning_rate": 7.583085670949083e-05, + "loss": 1.6576, + "step": 678 + }, + { + "epoch": 0.1639700555421396, + "grad_norm": 0.29964715242385864, + "learning_rate": 7.581708071960801e-05, + "loss": 1.6084, + "step": 679 + }, + { + "epoch": 0.16421154310553007, + "grad_norm": 0.3476538360118866, + "learning_rate": 7.580328326299863e-05, + "loss": 1.9535, + "step": 680 + }, + { + "epoch": 0.16445303066892056, + "grad_norm": 0.32849040627479553, + "learning_rate": 7.578946434793215e-05, + "loss": 1.8971, + "step": 681 + }, + { + "epoch": 0.16469451823231104, + "grad_norm": 0.3057873845100403, + "learning_rate": 7.577562398269079e-05, + "loss": 1.648, + "step": 682 + }, + { + "epoch": 0.16493600579570153, + "grad_norm": 0.2937708795070648, + "learning_rate": 7.576176217556972e-05, + "loss": 1.6217, + "step": 683 + }, + { + "epoch": 
0.16517749335909201, + "grad_norm": 0.30420657992362976, + "learning_rate": 7.57478789348769e-05, + "loss": 1.7992, + "step": 684 + }, + { + "epoch": 0.1654189809224825, + "grad_norm": 0.31237706542015076, + "learning_rate": 7.573397426893316e-05, + "loss": 1.7492, + "step": 685 + }, + { + "epoch": 0.16566046848587299, + "grad_norm": 0.36982595920562744, + "learning_rate": 7.572004818607218e-05, + "loss": 1.7512, + "step": 686 + }, + { + "epoch": 0.16590195604926347, + "grad_norm": 0.2950628697872162, + "learning_rate": 7.570610069464045e-05, + "loss": 1.7111, + "step": 687 + }, + { + "epoch": 0.16614344361265396, + "grad_norm": 0.30206093192100525, + "learning_rate": 7.569213180299732e-05, + "loss": 1.8203, + "step": 688 + }, + { + "epoch": 0.16638493117604444, + "grad_norm": 0.3030879497528076, + "learning_rate": 7.567814151951493e-05, + "loss": 1.7221, + "step": 689 + }, + { + "epoch": 0.16662641873943493, + "grad_norm": 0.3175910711288452, + "learning_rate": 7.566412985257826e-05, + "loss": 1.783, + "step": 690 + }, + { + "epoch": 0.16686790630282541, + "grad_norm": 0.3054318130016327, + "learning_rate": 7.565009681058514e-05, + "loss": 1.679, + "step": 691 + }, + { + "epoch": 0.1671093938662159, + "grad_norm": 0.3093288242816925, + "learning_rate": 7.563604240194616e-05, + "loss": 1.778, + "step": 692 + }, + { + "epoch": 0.16735088142960639, + "grad_norm": 0.3029101490974426, + "learning_rate": 7.562196663508473e-05, + "loss": 1.7636, + "step": 693 + }, + { + "epoch": 0.16759236899299687, + "grad_norm": 0.3081244230270386, + "learning_rate": 7.56078695184371e-05, + "loss": 1.8207, + "step": 694 + }, + { + "epoch": 0.16783385655638736, + "grad_norm": 0.30802908539772034, + "learning_rate": 7.559375106045223e-05, + "loss": 1.7582, + "step": 695 + }, + { + "epoch": 0.16807534411977784, + "grad_norm": 0.32356002926826477, + "learning_rate": 7.557961126959194e-05, + "loss": 1.8012, + "step": 696 + }, + { + "epoch": 0.16831683168316833, + "grad_norm": 0.3083191514015198, + "learning_rate": 7.556545015433084e-05, + "loss": 1.6644, + "step": 697 + }, + { + "epoch": 0.1685583192465588, + "grad_norm": 0.3402654528617859, + "learning_rate": 7.555126772315629e-05, + "loss": 1.8862, + "step": 698 + }, + { + "epoch": 0.1687998068099493, + "grad_norm": 0.3095254898071289, + "learning_rate": 7.553706398456841e-05, + "loss": 1.7341, + "step": 699 + }, + { + "epoch": 0.16904129437333978, + "grad_norm": 0.30369865894317627, + "learning_rate": 7.552283894708015e-05, + "loss": 1.7315, + "step": 700 + }, + { + "epoch": 0.16928278193673027, + "grad_norm": 0.319938600063324, + "learning_rate": 7.550859261921719e-05, + "loss": 1.7972, + "step": 701 + }, + { + "epoch": 0.16952426950012076, + "grad_norm": 0.299113392829895, + "learning_rate": 7.549432500951796e-05, + "loss": 1.7532, + "step": 702 + }, + { + "epoch": 0.16976575706351124, + "grad_norm": 0.29605212807655334, + "learning_rate": 7.548003612653362e-05, + "loss": 1.7625, + "step": 703 + }, + { + "epoch": 0.17000724462690173, + "grad_norm": 0.3049871325492859, + "learning_rate": 7.546572597882818e-05, + "loss": 1.7958, + "step": 704 + }, + { + "epoch": 0.1702487321902922, + "grad_norm": 0.30870726704597473, + "learning_rate": 7.545139457497829e-05, + "loss": 1.7153, + "step": 705 + }, + { + "epoch": 0.1704902197536827, + "grad_norm": 0.31261366605758667, + "learning_rate": 7.54370419235734e-05, + "loss": 1.704, + "step": 706 + }, + { + "epoch": 0.17073170731707318, + "grad_norm": 0.32341545820236206, + "learning_rate": 7.542266803321564e-05, + "loss": 
1.6498, + "step": 707 + }, + { + "epoch": 0.17097319488046367, + "grad_norm": 0.3037106990814209, + "learning_rate": 7.540827291251996e-05, + "loss": 1.726, + "step": 708 + }, + { + "epoch": 0.17121468244385415, + "grad_norm": 0.2945062220096588, + "learning_rate": 7.539385657011393e-05, + "loss": 1.6776, + "step": 709 + }, + { + "epoch": 0.17145617000724464, + "grad_norm": 0.3037776052951813, + "learning_rate": 7.537941901463791e-05, + "loss": 1.7051, + "step": 710 + }, + { + "epoch": 0.17169765757063513, + "grad_norm": 0.34461262822151184, + "learning_rate": 7.536496025474496e-05, + "loss": 1.5792, + "step": 711 + }, + { + "epoch": 0.17193914513402558, + "grad_norm": 0.2971360981464386, + "learning_rate": 7.535048029910081e-05, + "loss": 1.7157, + "step": 712 + }, + { + "epoch": 0.17218063269741607, + "grad_norm": 0.3049238324165344, + "learning_rate": 7.533597915638397e-05, + "loss": 1.8328, + "step": 713 + }, + { + "epoch": 0.17242212026080656, + "grad_norm": 0.29996106028556824, + "learning_rate": 7.532145683528555e-05, + "loss": 1.7274, + "step": 714 + }, + { + "epoch": 0.17266360782419704, + "grad_norm": 0.3050224483013153, + "learning_rate": 7.530691334450945e-05, + "loss": 1.6866, + "step": 715 + }, + { + "epoch": 0.17290509538758753, + "grad_norm": 0.3068046569824219, + "learning_rate": 7.529234869277219e-05, + "loss": 1.792, + "step": 716 + }, + { + "epoch": 0.173146582950978, + "grad_norm": 0.3204353451728821, + "learning_rate": 7.5277762888803e-05, + "loss": 1.7847, + "step": 717 + }, + { + "epoch": 0.1733880705143685, + "grad_norm": 0.4433777928352356, + "learning_rate": 7.526315594134378e-05, + "loss": 1.762, + "step": 718 + }, + { + "epoch": 0.17362955807775898, + "grad_norm": 0.3121720254421234, + "learning_rate": 7.524852785914911e-05, + "loss": 1.7186, + "step": 719 + }, + { + "epoch": 0.17387104564114947, + "grad_norm": 0.3399069309234619, + "learning_rate": 7.523387865098624e-05, + "loss": 1.8693, + "step": 720 + }, + { + "epoch": 0.17411253320453995, + "grad_norm": 0.3379225432872772, + "learning_rate": 7.521920832563506e-05, + "loss": 1.7691, + "step": 721 + }, + { + "epoch": 0.17435402076793044, + "grad_norm": 0.30595719814300537, + "learning_rate": 7.520451689188814e-05, + "loss": 1.726, + "step": 722 + }, + { + "epoch": 0.17459550833132093, + "grad_norm": 0.29468265175819397, + "learning_rate": 7.518980435855071e-05, + "loss": 1.673, + "step": 723 + }, + { + "epoch": 0.1748369958947114, + "grad_norm": 0.3205685019493103, + "learning_rate": 7.517507073444059e-05, + "loss": 1.9188, + "step": 724 + }, + { + "epoch": 0.1750784834581019, + "grad_norm": 0.32377034425735474, + "learning_rate": 7.51603160283883e-05, + "loss": 1.7882, + "step": 725 + }, + { + "epoch": 0.17531997102149238, + "grad_norm": 0.32858628034591675, + "learning_rate": 7.514554024923697e-05, + "loss": 1.8163, + "step": 726 + }, + { + "epoch": 0.17556145858488287, + "grad_norm": 0.30413132905960083, + "learning_rate": 7.513074340584237e-05, + "loss": 1.6486, + "step": 727 + }, + { + "epoch": 0.17580294614827335, + "grad_norm": 0.30543509125709534, + "learning_rate": 7.511592550707286e-05, + "loss": 1.6792, + "step": 728 + }, + { + "epoch": 0.17604443371166384, + "grad_norm": 0.3092809319496155, + "learning_rate": 7.51010865618095e-05, + "loss": 1.7791, + "step": 729 + }, + { + "epoch": 0.17628592127505432, + "grad_norm": 0.32701924443244934, + "learning_rate": 7.508622657894588e-05, + "loss": 1.6883, + "step": 730 + }, + { + "epoch": 0.1765274088384448, + "grad_norm": 0.33039215207099915, + 
"learning_rate": 7.507134556738822e-05, + "loss": 1.9009, + "step": 731 + }, + { + "epoch": 0.1767688964018353, + "grad_norm": 0.3000987470149994, + "learning_rate": 7.505644353605538e-05, + "loss": 1.7143, + "step": 732 + }, + { + "epoch": 0.17701038396522578, + "grad_norm": 0.3035810589790344, + "learning_rate": 7.504152049387878e-05, + "loss": 1.6682, + "step": 733 + }, + { + "epoch": 0.17725187152861627, + "grad_norm": 0.30469194054603577, + "learning_rate": 7.502657644980244e-05, + "loss": 1.7519, + "step": 734 + }, + { + "epoch": 0.17749335909200675, + "grad_norm": 0.30051693320274353, + "learning_rate": 7.501161141278298e-05, + "loss": 1.7051, + "step": 735 + }, + { + "epoch": 0.17773484665539724, + "grad_norm": 0.31448641419410706, + "learning_rate": 7.499662539178958e-05, + "loss": 1.674, + "step": 736 + }, + { + "epoch": 0.17797633421878772, + "grad_norm": 0.321920245885849, + "learning_rate": 7.498161839580405e-05, + "loss": 1.7703, + "step": 737 + }, + { + "epoch": 0.1782178217821782, + "grad_norm": 0.34229952096939087, + "learning_rate": 7.496659043382069e-05, + "loss": 1.7585, + "step": 738 + }, + { + "epoch": 0.1784593093455687, + "grad_norm": 0.2906430661678314, + "learning_rate": 7.495154151484644e-05, + "loss": 1.6548, + "step": 739 + }, + { + "epoch": 0.17870079690895918, + "grad_norm": 0.30244144797325134, + "learning_rate": 7.493647164790074e-05, + "loss": 1.7009, + "step": 740 + }, + { + "epoch": 0.17894228447234967, + "grad_norm": 0.31220030784606934, + "learning_rate": 7.492138084201561e-05, + "loss": 1.8037, + "step": 741 + }, + { + "epoch": 0.17918377203574015, + "grad_norm": 0.2887391149997711, + "learning_rate": 7.490626910623566e-05, + "loss": 1.6375, + "step": 742 + }, + { + "epoch": 0.17942525959913064, + "grad_norm": 0.3369121849536896, + "learning_rate": 7.489113644961797e-05, + "loss": 1.8906, + "step": 743 + }, + { + "epoch": 0.17966674716252112, + "grad_norm": 0.3304523527622223, + "learning_rate": 7.487598288123222e-05, + "loss": 1.8778, + "step": 744 + }, + { + "epoch": 0.1799082347259116, + "grad_norm": 0.3136029541492462, + "learning_rate": 7.486080841016059e-05, + "loss": 1.7777, + "step": 745 + }, + { + "epoch": 0.1801497222893021, + "grad_norm": 0.3245154619216919, + "learning_rate": 7.48456130454978e-05, + "loss": 1.7184, + "step": 746 + }, + { + "epoch": 0.18039120985269258, + "grad_norm": 0.2960347533226013, + "learning_rate": 7.48303967963511e-05, + "loss": 1.6104, + "step": 747 + }, + { + "epoch": 0.18063269741608307, + "grad_norm": 0.31395086646080017, + "learning_rate": 7.481515967184021e-05, + "loss": 1.8418, + "step": 748 + }, + { + "epoch": 0.18087418497947355, + "grad_norm": 0.30373284220695496, + "learning_rate": 7.479990168109744e-05, + "loss": 1.7451, + "step": 749 + }, + { + "epoch": 0.18111567254286404, + "grad_norm": 0.3072919249534607, + "learning_rate": 7.478462283326754e-05, + "loss": 1.7898, + "step": 750 + }, + { + "epoch": 0.18135716010625452, + "grad_norm": 0.2961108386516571, + "learning_rate": 7.476932313750779e-05, + "loss": 1.6443, + "step": 751 + }, + { + "epoch": 0.181598647669645, + "grad_norm": 0.3386465609073639, + "learning_rate": 7.475400260298797e-05, + "loss": 1.9018, + "step": 752 + }, + { + "epoch": 0.1818401352330355, + "grad_norm": 0.3179508447647095, + "learning_rate": 7.473866123889032e-05, + "loss": 1.7945, + "step": 753 + }, + { + "epoch": 0.18208162279642598, + "grad_norm": 0.30372482538223267, + "learning_rate": 7.472329905440961e-05, + "loss": 1.7731, + "step": 754 + }, + { + "epoch": 
0.18232311035981646, + "grad_norm": 0.2982485890388489, + "learning_rate": 7.470791605875302e-05, + "loss": 1.7926, + "step": 755 + }, + { + "epoch": 0.18256459792320695, + "grad_norm": 0.3064810633659363, + "learning_rate": 7.46925122611403e-05, + "loss": 1.7192, + "step": 756 + }, + { + "epoch": 0.18280608548659744, + "grad_norm": 0.3069106340408325, + "learning_rate": 7.467708767080358e-05, + "loss": 1.7361, + "step": 757 + }, + { + "epoch": 0.18304757304998792, + "grad_norm": 0.31539079546928406, + "learning_rate": 7.466164229698747e-05, + "loss": 1.7761, + "step": 758 + }, + { + "epoch": 0.1832890606133784, + "grad_norm": 0.3114735782146454, + "learning_rate": 7.464617614894908e-05, + "loss": 1.8215, + "step": 759 + }, + { + "epoch": 0.1835305481767689, + "grad_norm": 0.3202139437198639, + "learning_rate": 7.463068923595792e-05, + "loss": 1.7645, + "step": 760 + }, + { + "epoch": 0.18377203574015938, + "grad_norm": 0.2983264625072479, + "learning_rate": 7.461518156729599e-05, + "loss": 1.8844, + "step": 761 + }, + { + "epoch": 0.18401352330354986, + "grad_norm": 0.32856181263923645, + "learning_rate": 7.45996531522577e-05, + "loss": 1.9035, + "step": 762 + }, + { + "epoch": 0.18425501086694035, + "grad_norm": 0.31148043274879456, + "learning_rate": 7.45841040001499e-05, + "loss": 1.837, + "step": 763 + }, + { + "epoch": 0.18449649843033084, + "grad_norm": 0.30822792649269104, + "learning_rate": 7.456853412029184e-05, + "loss": 1.7931, + "step": 764 + }, + { + "epoch": 0.18473798599372132, + "grad_norm": 0.3262118101119995, + "learning_rate": 7.455294352201528e-05, + "loss": 1.8056, + "step": 765 + }, + { + "epoch": 0.1849794735571118, + "grad_norm": 0.32270196080207825, + "learning_rate": 7.453733221466429e-05, + "loss": 1.8264, + "step": 766 + }, + { + "epoch": 0.1852209611205023, + "grad_norm": 0.32929688692092896, + "learning_rate": 7.452170020759542e-05, + "loss": 1.8021, + "step": 767 + }, + { + "epoch": 0.18546244868389278, + "grad_norm": 0.3017376661300659, + "learning_rate": 7.450604751017762e-05, + "loss": 1.7503, + "step": 768 + }, + { + "epoch": 0.18570393624728326, + "grad_norm": 0.3037002980709076, + "learning_rate": 7.449037413179222e-05, + "loss": 1.6066, + "step": 769 + }, + { + "epoch": 0.18594542381067375, + "grad_norm": 0.3149946630001068, + "learning_rate": 7.447468008183295e-05, + "loss": 1.7274, + "step": 770 + }, + { + "epoch": 0.18618691137406423, + "grad_norm": 0.3143094480037689, + "learning_rate": 7.445896536970592e-05, + "loss": 1.7744, + "step": 771 + }, + { + "epoch": 0.18642839893745472, + "grad_norm": 0.32124435901641846, + "learning_rate": 7.444323000482968e-05, + "loss": 1.8213, + "step": 772 + }, + { + "epoch": 0.1866698865008452, + "grad_norm": 0.3078225255012512, + "learning_rate": 7.442747399663507e-05, + "loss": 1.6668, + "step": 773 + }, + { + "epoch": 0.1869113740642357, + "grad_norm": 0.3109733462333679, + "learning_rate": 7.441169735456537e-05, + "loss": 1.7679, + "step": 774 + }, + { + "epoch": 0.18715286162762618, + "grad_norm": 0.3216419816017151, + "learning_rate": 7.439590008807621e-05, + "loss": 1.8956, + "step": 775 + }, + { + "epoch": 0.18739434919101666, + "grad_norm": 0.3254031836986542, + "learning_rate": 7.438008220663556e-05, + "loss": 1.8686, + "step": 776 + }, + { + "epoch": 0.18763583675440715, + "grad_norm": 0.31165701150894165, + "learning_rate": 7.436424371972376e-05, + "loss": 1.6975, + "step": 777 + }, + { + "epoch": 0.18787732431779763, + "grad_norm": 0.3033682703971863, + "learning_rate": 7.43483846368335e-05, + "loss": 
1.7078, + "step": 778 + }, + { + "epoch": 0.18811881188118812, + "grad_norm": 0.3072250783443451, + "learning_rate": 7.433250496746985e-05, + "loss": 1.6495, + "step": 779 + }, + { + "epoch": 0.1883602994445786, + "grad_norm": 0.3142796754837036, + "learning_rate": 7.431660472115013e-05, + "loss": 1.6211, + "step": 780 + }, + { + "epoch": 0.1886017870079691, + "grad_norm": 0.31042274832725525, + "learning_rate": 7.430068390740409e-05, + "loss": 1.7299, + "step": 781 + }, + { + "epoch": 0.18884327457135958, + "grad_norm": 0.34713032841682434, + "learning_rate": 7.428474253577372e-05, + "loss": 1.9567, + "step": 782 + }, + { + "epoch": 0.18908476213475006, + "grad_norm": 0.30852535367012024, + "learning_rate": 7.426878061581342e-05, + "loss": 1.8149, + "step": 783 + }, + { + "epoch": 0.18932624969814055, + "grad_norm": 0.30727940797805786, + "learning_rate": 7.425279815708981e-05, + "loss": 1.7005, + "step": 784 + }, + { + "epoch": 0.18956773726153103, + "grad_norm": 0.3225751221179962, + "learning_rate": 7.423679516918192e-05, + "loss": 1.9, + "step": 785 + }, + { + "epoch": 0.18980922482492152, + "grad_norm": 0.3211837112903595, + "learning_rate": 7.4220771661681e-05, + "loss": 1.7269, + "step": 786 + }, + { + "epoch": 0.190050712388312, + "grad_norm": 0.3182501196861267, + "learning_rate": 7.420472764419065e-05, + "loss": 1.7219, + "step": 787 + }, + { + "epoch": 0.1902921999517025, + "grad_norm": 0.32098039984703064, + "learning_rate": 7.418866312632673e-05, + "loss": 1.7289, + "step": 788 + }, + { + "epoch": 0.19053368751509298, + "grad_norm": 0.3143134117126465, + "learning_rate": 7.41725781177174e-05, + "loss": 1.7644, + "step": 789 + }, + { + "epoch": 0.19077517507848346, + "grad_norm": 0.3101259469985962, + "learning_rate": 7.415647262800311e-05, + "loss": 1.7912, + "step": 790 + }, + { + "epoch": 0.19101666264187395, + "grad_norm": 0.33307313919067383, + "learning_rate": 7.414034666683657e-05, + "loss": 1.8878, + "step": 791 + }, + { + "epoch": 0.19125815020526443, + "grad_norm": 0.31299763917922974, + "learning_rate": 7.412420024388279e-05, + "loss": 1.7598, + "step": 792 + }, + { + "epoch": 0.19149963776865492, + "grad_norm": 0.2992435693740845, + "learning_rate": 7.410803336881898e-05, + "loss": 1.6938, + "step": 793 + }, + { + "epoch": 0.1917411253320454, + "grad_norm": 0.3298616111278534, + "learning_rate": 7.409184605133468e-05, + "loss": 1.7812, + "step": 794 + }, + { + "epoch": 0.1919826128954359, + "grad_norm": 0.3269992768764496, + "learning_rate": 7.407563830113163e-05, + "loss": 1.916, + "step": 795 + }, + { + "epoch": 0.19222410045882637, + "grad_norm": 0.3093508780002594, + "learning_rate": 7.405941012792385e-05, + "loss": 1.8387, + "step": 796 + }, + { + "epoch": 0.19246558802221686, + "grad_norm": 0.34046638011932373, + "learning_rate": 7.404316154143757e-05, + "loss": 2.0301, + "step": 797 + }, + { + "epoch": 0.19270707558560735, + "grad_norm": 0.32975658774375916, + "learning_rate": 7.40268925514113e-05, + "loss": 1.8212, + "step": 798 + }, + { + "epoch": 0.19294856314899783, + "grad_norm": 0.3060869872570038, + "learning_rate": 7.401060316759574e-05, + "loss": 1.7069, + "step": 799 + }, + { + "epoch": 0.19319005071238832, + "grad_norm": 0.31247884035110474, + "learning_rate": 7.399429339975379e-05, + "loss": 1.8025, + "step": 800 + }, + { + "epoch": 0.1934315382757788, + "grad_norm": 0.31649911403656006, + "learning_rate": 7.397796325766063e-05, + "loss": 1.7576, + "step": 801 + }, + { + "epoch": 0.1936730258391693, + "grad_norm": 0.3336128294467926, + 
"learning_rate": 7.396161275110362e-05, + "loss": 1.873, + "step": 802 + }, + { + "epoch": 0.19391451340255977, + "grad_norm": 0.31450867652893066, + "learning_rate": 7.394524188988232e-05, + "loss": 1.8446, + "step": 803 + }, + { + "epoch": 0.19415600096595026, + "grad_norm": 0.3035898506641388, + "learning_rate": 7.39288506838085e-05, + "loss": 1.6569, + "step": 804 + }, + { + "epoch": 0.19439748852934075, + "grad_norm": 0.3701397478580475, + "learning_rate": 7.39124391427061e-05, + "loss": 1.6752, + "step": 805 + }, + { + "epoch": 0.19463897609273123, + "grad_norm": 0.32577869296073914, + "learning_rate": 7.389600727641131e-05, + "loss": 1.7124, + "step": 806 + }, + { + "epoch": 0.19488046365612172, + "grad_norm": 0.3136950433254242, + "learning_rate": 7.387955509477242e-05, + "loss": 1.7551, + "step": 807 + }, + { + "epoch": 0.1951219512195122, + "grad_norm": 0.31354862451553345, + "learning_rate": 7.386308260764995e-05, + "loss": 1.7758, + "step": 808 + }, + { + "epoch": 0.1953634387829027, + "grad_norm": 0.38379475474357605, + "learning_rate": 7.384658982491657e-05, + "loss": 1.8878, + "step": 809 + }, + { + "epoch": 0.19560492634629317, + "grad_norm": 0.3336881101131439, + "learning_rate": 7.383007675645712e-05, + "loss": 1.9188, + "step": 810 + }, + { + "epoch": 0.19584641390968366, + "grad_norm": 0.2996203303337097, + "learning_rate": 7.381354341216858e-05, + "loss": 1.5913, + "step": 811 + }, + { + "epoch": 0.19608790147307414, + "grad_norm": 0.33541610836982727, + "learning_rate": 7.379698980196013e-05, + "loss": 1.7095, + "step": 812 + }, + { + "epoch": 0.19632938903646463, + "grad_norm": 0.30465638637542725, + "learning_rate": 7.378041593575305e-05, + "loss": 1.6976, + "step": 813 + }, + { + "epoch": 0.19657087659985512, + "grad_norm": 0.30584558844566345, + "learning_rate": 7.376382182348076e-05, + "loss": 1.5261, + "step": 814 + }, + { + "epoch": 0.1968123641632456, + "grad_norm": 0.34166428446769714, + "learning_rate": 7.374720747508885e-05, + "loss": 1.9264, + "step": 815 + }, + { + "epoch": 0.1970538517266361, + "grad_norm": 0.3009068965911865, + "learning_rate": 7.373057290053502e-05, + "loss": 1.8388, + "step": 816 + }, + { + "epoch": 0.19729533929002657, + "grad_norm": 0.3169417381286621, + "learning_rate": 7.371391810978909e-05, + "loss": 1.6588, + "step": 817 + }, + { + "epoch": 0.19753682685341706, + "grad_norm": 0.30887770652770996, + "learning_rate": 7.369724311283296e-05, + "loss": 1.7986, + "step": 818 + }, + { + "epoch": 0.19777831441680754, + "grad_norm": 0.31081607937812805, + "learning_rate": 7.368054791966073e-05, + "loss": 1.6954, + "step": 819 + }, + { + "epoch": 0.19801980198019803, + "grad_norm": 0.3122561275959015, + "learning_rate": 7.366383254027853e-05, + "loss": 1.738, + "step": 820 + }, + { + "epoch": 0.19826128954358851, + "grad_norm": 0.30774828791618347, + "learning_rate": 7.36470969847046e-05, + "loss": 1.7573, + "step": 821 + }, + { + "epoch": 0.198502777106979, + "grad_norm": 0.29315096139907837, + "learning_rate": 7.36303412629693e-05, + "loss": 1.6716, + "step": 822 + }, + { + "epoch": 0.19874426467036949, + "grad_norm": 0.30455148220062256, + "learning_rate": 7.361356538511506e-05, + "loss": 1.7381, + "step": 823 + }, + { + "epoch": 0.19898575223375997, + "grad_norm": 0.3003849387168884, + "learning_rate": 7.359676936119635e-05, + "loss": 1.7738, + "step": 824 + }, + { + "epoch": 0.19922723979715046, + "grad_norm": 0.3111303746700287, + "learning_rate": 7.357995320127981e-05, + "loss": 1.8793, + "step": 825 + }, + { + "epoch": 
0.19946872736054094, + "grad_norm": 0.31026238203048706, + "learning_rate": 7.356311691544406e-05, + "loss": 1.6743, + "step": 826 + }, + { + "epoch": 0.19971021492393143, + "grad_norm": 0.30901458859443665, + "learning_rate": 7.354626051377981e-05, + "loss": 1.6457, + "step": 827 + }, + { + "epoch": 0.19995170248732191, + "grad_norm": 0.3286570906639099, + "learning_rate": 7.352938400638986e-05, + "loss": 1.9002, + "step": 828 + }, + { + "epoch": 0.2001931900507124, + "grad_norm": 0.3596792221069336, + "learning_rate": 7.3512487403389e-05, + "loss": 2.0187, + "step": 829 + }, + { + "epoch": 0.20043467761410289, + "grad_norm": 0.325364887714386, + "learning_rate": 7.349557071490411e-05, + "loss": 1.9584, + "step": 830 + }, + { + "epoch": 0.20067616517749337, + "grad_norm": 0.3105791509151459, + "learning_rate": 7.347863395107411e-05, + "loss": 1.7492, + "step": 831 + }, + { + "epoch": 0.20091765274088386, + "grad_norm": 0.3149196207523346, + "learning_rate": 7.346167712204991e-05, + "loss": 1.7646, + "step": 832 + }, + { + "epoch": 0.20115914030427434, + "grad_norm": 0.31853047013282776, + "learning_rate": 7.344470023799447e-05, + "loss": 1.7379, + "step": 833 + }, + { + "epoch": 0.20140062786766483, + "grad_norm": 0.31346553564071655, + "learning_rate": 7.34277033090828e-05, + "loss": 1.7015, + "step": 834 + }, + { + "epoch": 0.2016421154310553, + "grad_norm": 0.31461572647094727, + "learning_rate": 7.341068634550185e-05, + "loss": 1.8486, + "step": 835 + }, + { + "epoch": 0.2018836029944458, + "grad_norm": 0.31537574529647827, + "learning_rate": 7.339364935745067e-05, + "loss": 1.6802, + "step": 836 + }, + { + "epoch": 0.20212509055783628, + "grad_norm": 0.3205339014530182, + "learning_rate": 7.337659235514024e-05, + "loss": 1.7981, + "step": 837 + }, + { + "epoch": 0.20236657812122677, + "grad_norm": 0.3003098666667938, + "learning_rate": 7.335951534879356e-05, + "loss": 1.7005, + "step": 838 + }, + { + "epoch": 0.20260806568461726, + "grad_norm": 0.3131178617477417, + "learning_rate": 7.334241834864562e-05, + "loss": 1.6863, + "step": 839 + }, + { + "epoch": 0.20284955324800774, + "grad_norm": 0.3473765552043915, + "learning_rate": 7.33253013649434e-05, + "loss": 2.0212, + "step": 840 + }, + { + "epoch": 0.20309104081139823, + "grad_norm": 0.2994740307331085, + "learning_rate": 7.330816440794585e-05, + "loss": 1.7631, + "step": 841 + }, + { + "epoch": 0.20333252837478868, + "grad_norm": 0.29169461131095886, + "learning_rate": 7.329100748792387e-05, + "loss": 1.5282, + "step": 842 + }, + { + "epoch": 0.20357401593817917, + "grad_norm": 0.3285301625728607, + "learning_rate": 7.327383061516035e-05, + "loss": 1.8478, + "step": 843 + }, + { + "epoch": 0.20381550350156966, + "grad_norm": 0.2975311279296875, + "learning_rate": 7.325663379995016e-05, + "loss": 1.6736, + "step": 844 + }, + { + "epoch": 0.20405699106496014, + "grad_norm": 0.30266156792640686, + "learning_rate": 7.323941705260006e-05, + "loss": 1.7203, + "step": 845 + }, + { + "epoch": 0.20429847862835063, + "grad_norm": 0.29713326692581177, + "learning_rate": 7.322218038342881e-05, + "loss": 1.6709, + "step": 846 + }, + { + "epoch": 0.2045399661917411, + "grad_norm": 0.29916587471961975, + "learning_rate": 7.320492380276711e-05, + "loss": 1.6718, + "step": 847 + }, + { + "epoch": 0.2047814537551316, + "grad_norm": 0.31391361355781555, + "learning_rate": 7.318764732095753e-05, + "loss": 1.8098, + "step": 848 + }, + { + "epoch": 0.20502294131852208, + "grad_norm": 0.3385540843009949, + "learning_rate": 7.317035094835467e-05, + 
"loss": 1.916, + "step": 849 + }, + { + "epoch": 0.20526442888191257, + "grad_norm": 0.31298068165779114, + "learning_rate": 7.315303469532494e-05, + "loss": 1.7697, + "step": 850 + }, + { + "epoch": 0.20550591644530306, + "grad_norm": 0.30968594551086426, + "learning_rate": 7.313569857224674e-05, + "loss": 1.7315, + "step": 851 + }, + { + "epoch": 0.20574740400869354, + "grad_norm": 0.301782488822937, + "learning_rate": 7.311834258951038e-05, + "loss": 1.6389, + "step": 852 + }, + { + "epoch": 0.20598889157208403, + "grad_norm": 0.3327527940273285, + "learning_rate": 7.310096675751802e-05, + "loss": 1.9758, + "step": 853 + }, + { + "epoch": 0.2062303791354745, + "grad_norm": 0.3157320022583008, + "learning_rate": 7.308357108668377e-05, + "loss": 1.9141, + "step": 854 + }, + { + "epoch": 0.206471866698865, + "grad_norm": 0.3155609667301178, + "learning_rate": 7.306615558743358e-05, + "loss": 1.741, + "step": 855 + }, + { + "epoch": 0.20671335426225548, + "grad_norm": 0.326360285282135, + "learning_rate": 7.304872027020536e-05, + "loss": 1.9365, + "step": 856 + }, + { + "epoch": 0.20695484182564597, + "grad_norm": 0.31044507026672363, + "learning_rate": 7.303126514544881e-05, + "loss": 1.7731, + "step": 857 + }, + { + "epoch": 0.20719632938903645, + "grad_norm": 0.3183859884738922, + "learning_rate": 7.301379022362556e-05, + "loss": 1.85, + "step": 858 + }, + { + "epoch": 0.20743781695242694, + "grad_norm": 0.31985989212989807, + "learning_rate": 7.299629551520908e-05, + "loss": 1.796, + "step": 859 + }, + { + "epoch": 0.20767930451581743, + "grad_norm": 0.33011680841445923, + "learning_rate": 7.297878103068471e-05, + "loss": 1.9272, + "step": 860 + }, + { + "epoch": 0.2079207920792079, + "grad_norm": 0.31791943311691284, + "learning_rate": 7.296124678054963e-05, + "loss": 1.5777, + "step": 861 + }, + { + "epoch": 0.2081622796425984, + "grad_norm": 0.29987606406211853, + "learning_rate": 7.294369277531287e-05, + "loss": 1.7308, + "step": 862 + }, + { + "epoch": 0.20840376720598888, + "grad_norm": 0.3212811052799225, + "learning_rate": 7.292611902549534e-05, + "loss": 1.7248, + "step": 863 + }, + { + "epoch": 0.20864525476937937, + "grad_norm": 0.31155189871788025, + "learning_rate": 7.290852554162972e-05, + "loss": 1.6799, + "step": 864 + }, + { + "epoch": 0.20888674233276985, + "grad_norm": 0.30869436264038086, + "learning_rate": 7.289091233426054e-05, + "loss": 1.7143, + "step": 865 + }, + { + "epoch": 0.20912822989616034, + "grad_norm": 0.30815213918685913, + "learning_rate": 7.287327941394416e-05, + "loss": 1.6345, + "step": 866 + }, + { + "epoch": 0.20936971745955082, + "grad_norm": 0.32789891958236694, + "learning_rate": 7.285562679124878e-05, + "loss": 1.9442, + "step": 867 + }, + { + "epoch": 0.2096112050229413, + "grad_norm": 0.3645618259906769, + "learning_rate": 7.283795447675435e-05, + "loss": 1.9534, + "step": 868 + }, + { + "epoch": 0.2098526925863318, + "grad_norm": 0.311452180147171, + "learning_rate": 7.282026248105268e-05, + "loss": 1.9428, + "step": 869 + }, + { + "epoch": 0.21009418014972228, + "grad_norm": 0.3184005916118622, + "learning_rate": 7.280255081474731e-05, + "loss": 1.8433, + "step": 870 + }, + { + "epoch": 0.21033566771311277, + "grad_norm": 0.3258514404296875, + "learning_rate": 7.278481948845364e-05, + "loss": 1.9518, + "step": 871 + }, + { + "epoch": 0.21057715527650325, + "grad_norm": 0.29278308153152466, + "learning_rate": 7.276706851279883e-05, + "loss": 1.6152, + "step": 872 + }, + { + "epoch": 0.21081864283989374, + "grad_norm": 0.30696970224380493, 
+ "learning_rate": 7.274929789842177e-05, + "loss": 1.7308, + "step": 873 + }, + { + "epoch": 0.21106013040328422, + "grad_norm": 0.3068031668663025, + "learning_rate": 7.273150765597319e-05, + "loss": 1.7358, + "step": 874 + }, + { + "epoch": 0.2113016179666747, + "grad_norm": 0.31447917222976685, + "learning_rate": 7.271369779611553e-05, + "loss": 1.7537, + "step": 875 + }, + { + "epoch": 0.2115431055300652, + "grad_norm": 0.31334996223449707, + "learning_rate": 7.269586832952303e-05, + "loss": 1.7341, + "step": 876 + }, + { + "epoch": 0.21178459309345568, + "grad_norm": 0.2954760193824768, + "learning_rate": 7.267801926688164e-05, + "loss": 1.6958, + "step": 877 + }, + { + "epoch": 0.21202608065684617, + "grad_norm": 0.30289843678474426, + "learning_rate": 7.26601506188891e-05, + "loss": 1.6662, + "step": 878 + }, + { + "epoch": 0.21226756822023665, + "grad_norm": 0.31064343452453613, + "learning_rate": 7.264226239625484e-05, + "loss": 1.6904, + "step": 879 + }, + { + "epoch": 0.21250905578362714, + "grad_norm": 0.32711198925971985, + "learning_rate": 7.262435460970006e-05, + "loss": 1.8815, + "step": 880 + }, + { + "epoch": 0.21275054334701762, + "grad_norm": 0.314898282289505, + "learning_rate": 7.260642726995768e-05, + "loss": 1.8043, + "step": 881 + }, + { + "epoch": 0.2129920309104081, + "grad_norm": 0.34022215008735657, + "learning_rate": 7.25884803877723e-05, + "loss": 1.8118, + "step": 882 + }, + { + "epoch": 0.2132335184737986, + "grad_norm": 0.29776495695114136, + "learning_rate": 7.25705139739003e-05, + "loss": 1.6633, + "step": 883 + }, + { + "epoch": 0.21347500603718908, + "grad_norm": 0.3354003429412842, + "learning_rate": 7.25525280391097e-05, + "loss": 1.8794, + "step": 884 + }, + { + "epoch": 0.21371649360057957, + "grad_norm": 0.3095185160636902, + "learning_rate": 7.253452259418027e-05, + "loss": 1.6259, + "step": 885 + }, + { + "epoch": 0.21395798116397005, + "grad_norm": 0.2914026379585266, + "learning_rate": 7.251649764990343e-05, + "loss": 1.6233, + "step": 886 + }, + { + "epoch": 0.21419946872736054, + "grad_norm": 0.3036092221736908, + "learning_rate": 7.249845321708234e-05, + "loss": 1.7473, + "step": 887 + }, + { + "epoch": 0.21444095629075102, + "grad_norm": 0.31751853227615356, + "learning_rate": 7.248038930653178e-05, + "loss": 1.7744, + "step": 888 + }, + { + "epoch": 0.2146824438541415, + "grad_norm": 0.31027981638908386, + "learning_rate": 7.246230592907824e-05, + "loss": 1.7248, + "step": 889 + }, + { + "epoch": 0.214923931417532, + "grad_norm": 0.3045722246170044, + "learning_rate": 7.244420309555989e-05, + "loss": 1.7947, + "step": 890 + }, + { + "epoch": 0.21516541898092248, + "grad_norm": 0.30841004848480225, + "learning_rate": 7.242608081682653e-05, + "loss": 1.6543, + "step": 891 + }, + { + "epoch": 0.21540690654431296, + "grad_norm": 0.31386974453926086, + "learning_rate": 7.24079391037396e-05, + "loss": 1.787, + "step": 892 + }, + { + "epoch": 0.21564839410770345, + "grad_norm": 0.3163403868675232, + "learning_rate": 7.238977796717225e-05, + "loss": 1.6371, + "step": 893 + }, + { + "epoch": 0.21588988167109394, + "grad_norm": 0.32152020931243896, + "learning_rate": 7.237159741800923e-05, + "loss": 1.7653, + "step": 894 + }, + { + "epoch": 0.21613136923448442, + "grad_norm": 0.31930601596832275, + "learning_rate": 7.235339746714693e-05, + "loss": 1.7829, + "step": 895 + }, + { + "epoch": 0.2163728567978749, + "grad_norm": 0.2922723889350891, + "learning_rate": 7.233517812549334e-05, + "loss": 1.6658, + "step": 896 + }, + { + "epoch": 
0.2166143443612654, + "grad_norm": 0.3041679561138153, + "learning_rate": 7.231693940396811e-05, + "loss": 1.7421, + "step": 897 + }, + { + "epoch": 0.21685583192465588, + "grad_norm": 0.30257824063301086, + "learning_rate": 7.229868131350254e-05, + "loss": 1.7724, + "step": 898 + }, + { + "epoch": 0.21709731948804636, + "grad_norm": 0.31659653782844543, + "learning_rate": 7.228040386503943e-05, + "loss": 1.7479, + "step": 899 + }, + { + "epoch": 0.21733880705143685, + "grad_norm": 0.31071823835372925, + "learning_rate": 7.22621070695333e-05, + "loss": 1.8688, + "step": 900 + }, + { + "epoch": 0.21758029461482734, + "grad_norm": 0.2893757224082947, + "learning_rate": 7.224379093795016e-05, + "loss": 1.6186, + "step": 901 + }, + { + "epoch": 0.21782178217821782, + "grad_norm": 0.2960699498653412, + "learning_rate": 7.22254554812677e-05, + "loss": 1.7204, + "step": 902 + }, + { + "epoch": 0.2180632697416083, + "grad_norm": 0.30816584825515747, + "learning_rate": 7.220710071047515e-05, + "loss": 1.8978, + "step": 903 + }, + { + "epoch": 0.2183047573049988, + "grad_norm": 0.32497209310531616, + "learning_rate": 7.21887266365733e-05, + "loss": 2.0118, + "step": 904 + }, + { + "epoch": 0.21854624486838928, + "grad_norm": 0.29622262716293335, + "learning_rate": 7.217033327057453e-05, + "loss": 1.6086, + "step": 905 + }, + { + "epoch": 0.21878773243177976, + "grad_norm": 0.2999928295612335, + "learning_rate": 7.215192062350279e-05, + "loss": 1.7762, + "step": 906 + }, + { + "epoch": 0.21902921999517025, + "grad_norm": 0.3181908428668976, + "learning_rate": 7.213348870639357e-05, + "loss": 1.7705, + "step": 907 + }, + { + "epoch": 0.21927070755856073, + "grad_norm": 0.3228391110897064, + "learning_rate": 7.211503753029392e-05, + "loss": 1.7532, + "step": 908 + }, + { + "epoch": 0.21951219512195122, + "grad_norm": 0.30232638120651245, + "learning_rate": 7.209656710626243e-05, + "loss": 1.7742, + "step": 909 + }, + { + "epoch": 0.2197536826853417, + "grad_norm": 0.30942896008491516, + "learning_rate": 7.207807744536922e-05, + "loss": 1.6523, + "step": 910 + }, + { + "epoch": 0.2199951702487322, + "grad_norm": 0.31623750925064087, + "learning_rate": 7.205956855869593e-05, + "loss": 1.6995, + "step": 911 + }, + { + "epoch": 0.22023665781212268, + "grad_norm": 0.29854124784469604, + "learning_rate": 7.204104045733576e-05, + "loss": 1.6602, + "step": 912 + }, + { + "epoch": 0.22047814537551316, + "grad_norm": 0.32152605056762695, + "learning_rate": 7.202249315239342e-05, + "loss": 1.857, + "step": 913 + }, + { + "epoch": 0.22071963293890365, + "grad_norm": 0.3507840633392334, + "learning_rate": 7.200392665498505e-05, + "loss": 1.9128, + "step": 914 + }, + { + "epoch": 0.22096112050229413, + "grad_norm": 0.30421963334083557, + "learning_rate": 7.198534097623841e-05, + "loss": 1.867, + "step": 915 + }, + { + "epoch": 0.22120260806568462, + "grad_norm": 0.31841713190078735, + "learning_rate": 7.196673612729268e-05, + "loss": 1.8014, + "step": 916 + }, + { + "epoch": 0.2214440956290751, + "grad_norm": 0.3139771521091461, + "learning_rate": 7.194811211929856e-05, + "loss": 1.8255, + "step": 917 + }, + { + "epoch": 0.2216855831924656, + "grad_norm": 0.31301793456077576, + "learning_rate": 7.19294689634182e-05, + "loss": 1.7579, + "step": 918 + }, + { + "epoch": 0.22192707075585608, + "grad_norm": 0.2959013283252716, + "learning_rate": 7.191080667082529e-05, + "loss": 1.5714, + "step": 919 + }, + { + "epoch": 0.22216855831924656, + "grad_norm": 0.3038369417190552, + "learning_rate": 7.189212525270492e-05, + 
"loss": 1.6526, + "step": 920 + }, + { + "epoch": 0.22241004588263705, + "grad_norm": 0.31139951944351196, + "learning_rate": 7.187342472025368e-05, + "loss": 1.8009, + "step": 921 + }, + { + "epoch": 0.22265153344602753, + "grad_norm": 0.3210260570049286, + "learning_rate": 7.185470508467963e-05, + "loss": 1.7528, + "step": 922 + }, + { + "epoch": 0.22289302100941802, + "grad_norm": 0.3127114474773407, + "learning_rate": 7.183596635720222e-05, + "loss": 1.8188, + "step": 923 + }, + { + "epoch": 0.2231345085728085, + "grad_norm": 0.29326117038726807, + "learning_rate": 7.18172085490524e-05, + "loss": 1.6762, + "step": 924 + }, + { + "epoch": 0.223375996136199, + "grad_norm": 0.30718374252319336, + "learning_rate": 7.179843167147253e-05, + "loss": 1.7206, + "step": 925 + }, + { + "epoch": 0.22361748369958948, + "grad_norm": 0.3027680218219757, + "learning_rate": 7.177963573571641e-05, + "loss": 1.6067, + "step": 926 + }, + { + "epoch": 0.22385897126297996, + "grad_norm": 0.30007830262184143, + "learning_rate": 7.176082075304924e-05, + "loss": 1.591, + "step": 927 + }, + { + "epoch": 0.22410045882637045, + "grad_norm": 0.31990012526512146, + "learning_rate": 7.17419867347477e-05, + "loss": 1.9032, + "step": 928 + }, + { + "epoch": 0.22434194638976093, + "grad_norm": 0.3229444622993469, + "learning_rate": 7.17231336920998e-05, + "loss": 1.6801, + "step": 929 + }, + { + "epoch": 0.22458343395315142, + "grad_norm": 0.3086046874523163, + "learning_rate": 7.170426163640497e-05, + "loss": 1.827, + "step": 930 + }, + { + "epoch": 0.2248249215165419, + "grad_norm": 0.32034003734588623, + "learning_rate": 7.168537057897407e-05, + "loss": 1.7706, + "step": 931 + }, + { + "epoch": 0.2250664090799324, + "grad_norm": 0.3041267991065979, + "learning_rate": 7.166646053112933e-05, + "loss": 1.771, + "step": 932 + }, + { + "epoch": 0.22530789664332287, + "grad_norm": 0.3302775025367737, + "learning_rate": 7.164753150420436e-05, + "loss": 1.7872, + "step": 933 + }, + { + "epoch": 0.22554938420671336, + "grad_norm": 0.3071431815624237, + "learning_rate": 7.162858350954412e-05, + "loss": 1.7244, + "step": 934 + }, + { + "epoch": 0.22579087177010385, + "grad_norm": 0.30182769894599915, + "learning_rate": 7.160961655850501e-05, + "loss": 1.6328, + "step": 935 + }, + { + "epoch": 0.22603235933349433, + "grad_norm": 0.30967414379119873, + "learning_rate": 7.159063066245471e-05, + "loss": 1.8115, + "step": 936 + }, + { + "epoch": 0.22627384689688482, + "grad_norm": 0.3253704905509949, + "learning_rate": 7.157162583277229e-05, + "loss": 1.7741, + "step": 937 + }, + { + "epoch": 0.2265153344602753, + "grad_norm": 0.30837851762771606, + "learning_rate": 7.155260208084817e-05, + "loss": 1.7762, + "step": 938 + }, + { + "epoch": 0.2267568220236658, + "grad_norm": 0.35172855854034424, + "learning_rate": 7.153355941808413e-05, + "loss": 2.0043, + "step": 939 + }, + { + "epoch": 0.22699830958705627, + "grad_norm": 0.35760238766670227, + "learning_rate": 7.151449785589324e-05, + "loss": 1.7604, + "step": 940 + }, + { + "epoch": 0.22723979715044676, + "grad_norm": 0.340904176235199, + "learning_rate": 7.149541740569991e-05, + "loss": 1.8142, + "step": 941 + }, + { + "epoch": 0.22748128471383725, + "grad_norm": 0.3355856239795685, + "learning_rate": 7.147631807893989e-05, + "loss": 1.8198, + "step": 942 + }, + { + "epoch": 0.22772277227722773, + "grad_norm": 0.3542833626270294, + "learning_rate": 7.145719988706024e-05, + "loss": 1.7095, + "step": 943 + }, + { + "epoch": 0.22796425984061822, + "grad_norm": 0.31368035078048706, 
+ "learning_rate": 7.143806284151933e-05, + "loss": 1.7384, + "step": 944 + }, + { + "epoch": 0.2282057474040087, + "grad_norm": 0.3218083679676056, + "learning_rate": 7.141890695378678e-05, + "loss": 1.6452, + "step": 945 + }, + { + "epoch": 0.2284472349673992, + "grad_norm": 0.3157740533351898, + "learning_rate": 7.139973223534359e-05, + "loss": 1.7696, + "step": 946 + }, + { + "epoch": 0.22868872253078967, + "grad_norm": 0.32926589250564575, + "learning_rate": 7.138053869768196e-05, + "loss": 1.7798, + "step": 947 + }, + { + "epoch": 0.22893021009418016, + "grad_norm": 0.3095945417881012, + "learning_rate": 7.136132635230542e-05, + "loss": 1.8042, + "step": 948 + }, + { + "epoch": 0.22917169765757064, + "grad_norm": 0.30121171474456787, + "learning_rate": 7.134209521072878e-05, + "loss": 1.6287, + "step": 949 + }, + { + "epoch": 0.22941318522096113, + "grad_norm": 0.3294576406478882, + "learning_rate": 7.132284528447808e-05, + "loss": 1.8929, + "step": 950 + }, + { + "epoch": 0.22965467278435162, + "grad_norm": 0.3472389876842499, + "learning_rate": 7.130357658509062e-05, + "loss": 1.824, + "step": 951 + }, + { + "epoch": 0.2298961603477421, + "grad_norm": 0.32449764013290405, + "learning_rate": 7.128428912411498e-05, + "loss": 1.6925, + "step": 952 + }, + { + "epoch": 0.2301376479111326, + "grad_norm": 0.3121519386768341, + "learning_rate": 7.126498291311098e-05, + "loss": 1.7803, + "step": 953 + }, + { + "epoch": 0.23037913547452307, + "grad_norm": 0.3130584955215454, + "learning_rate": 7.124565796364964e-05, + "loss": 1.815, + "step": 954 + }, + { + "epoch": 0.23062062303791356, + "grad_norm": 0.3336242139339447, + "learning_rate": 7.122631428731327e-05, + "loss": 1.8314, + "step": 955 + }, + { + "epoch": 0.23086211060130404, + "grad_norm": 0.32837650179862976, + "learning_rate": 7.120695189569536e-05, + "loss": 1.8304, + "step": 956 + }, + { + "epoch": 0.23110359816469453, + "grad_norm": 0.3215225338935852, + "learning_rate": 7.11875708004006e-05, + "loss": 1.796, + "step": 957 + }, + { + "epoch": 0.23134508572808501, + "grad_norm": 0.3286936283111572, + "learning_rate": 7.116817101304497e-05, + "loss": 1.8722, + "step": 958 + }, + { + "epoch": 0.2315865732914755, + "grad_norm": 0.30622512102127075, + "learning_rate": 7.114875254525557e-05, + "loss": 1.7254, + "step": 959 + }, + { + "epoch": 0.23182806085486599, + "grad_norm": 0.3257673382759094, + "learning_rate": 7.112931540867074e-05, + "loss": 1.7707, + "step": 960 + }, + { + "epoch": 0.23206954841825647, + "grad_norm": 0.3099058270454407, + "learning_rate": 7.110985961494e-05, + "loss": 1.7187, + "step": 961 + }, + { + "epoch": 0.23231103598164696, + "grad_norm": 0.2989310324192047, + "learning_rate": 7.109038517572401e-05, + "loss": 1.7216, + "step": 962 + }, + { + "epoch": 0.23255252354503744, + "grad_norm": 0.2901287376880646, + "learning_rate": 7.107089210269472e-05, + "loss": 1.5476, + "step": 963 + }, + { + "epoch": 0.23279401110842793, + "grad_norm": 0.31841766834259033, + "learning_rate": 7.10513804075351e-05, + "loss": 1.7271, + "step": 964 + }, + { + "epoch": 0.23303549867181841, + "grad_norm": 0.3112894892692566, + "learning_rate": 7.103185010193938e-05, + "loss": 1.8632, + "step": 965 + }, + { + "epoch": 0.2332769862352089, + "grad_norm": 0.29903125762939453, + "learning_rate": 7.101230119761294e-05, + "loss": 1.5865, + "step": 966 + }, + { + "epoch": 0.23351847379859939, + "grad_norm": 0.34164854884147644, + "learning_rate": 7.099273370627225e-05, + "loss": 1.8468, + "step": 967 + }, + { + "epoch": 
0.23375996136198987, + "grad_norm": 0.3156038522720337, + "learning_rate": 7.097314763964496e-05, + "loss": 1.8972, + "step": 968 + }, + { + "epoch": 0.23400144892538036, + "grad_norm": 0.3213566541671753, + "learning_rate": 7.095354300946988e-05, + "loss": 1.7789, + "step": 969 + }, + { + "epoch": 0.23424293648877084, + "grad_norm": 0.31230372190475464, + "learning_rate": 7.093391982749686e-05, + "loss": 1.8018, + "step": 970 + }, + { + "epoch": 0.2344844240521613, + "grad_norm": 0.2937510907649994, + "learning_rate": 7.091427810548698e-05, + "loss": 1.6656, + "step": 971 + }, + { + "epoch": 0.23472591161555179, + "grad_norm": 0.3283037841320038, + "learning_rate": 7.089461785521232e-05, + "loss": 1.9303, + "step": 972 + }, + { + "epoch": 0.23496739917894227, + "grad_norm": 0.29008540511131287, + "learning_rate": 7.087493908845617e-05, + "loss": 1.6371, + "step": 973 + }, + { + "epoch": 0.23520888674233276, + "grad_norm": 0.29574844241142273, + "learning_rate": 7.085524181701281e-05, + "loss": 1.6921, + "step": 974 + }, + { + "epoch": 0.23545037430572324, + "grad_norm": 0.30947405099868774, + "learning_rate": 7.083552605268772e-05, + "loss": 1.7036, + "step": 975 + }, + { + "epoch": 0.23569186186911373, + "grad_norm": 0.30380678176879883, + "learning_rate": 7.081579180729739e-05, + "loss": 1.7498, + "step": 976 + }, + { + "epoch": 0.2359333494325042, + "grad_norm": 0.30693385004997253, + "learning_rate": 7.079603909266939e-05, + "loss": 1.6627, + "step": 977 + }, + { + "epoch": 0.2361748369958947, + "grad_norm": 0.31431153416633606, + "learning_rate": 7.07762679206424e-05, + "loss": 1.7361, + "step": 978 + }, + { + "epoch": 0.23641632455928518, + "grad_norm": 0.3664765954017639, + "learning_rate": 7.075647830306614e-05, + "loss": 2.0544, + "step": 979 + }, + { + "epoch": 0.23665781212267567, + "grad_norm": 0.29501873254776, + "learning_rate": 7.073667025180136e-05, + "loss": 1.6702, + "step": 980 + }, + { + "epoch": 0.23689929968606616, + "grad_norm": 0.3374174237251282, + "learning_rate": 7.07168437787199e-05, + "loss": 1.7518, + "step": 981 + }, + { + "epoch": 0.23714078724945664, + "grad_norm": 0.30214452743530273, + "learning_rate": 7.069699889570464e-05, + "loss": 1.7077, + "step": 982 + }, + { + "epoch": 0.23738227481284713, + "grad_norm": 0.32760128378868103, + "learning_rate": 7.067713561464943e-05, + "loss": 1.7956, + "step": 983 + }, + { + "epoch": 0.2376237623762376, + "grad_norm": 0.32891011238098145, + "learning_rate": 7.065725394745925e-05, + "loss": 1.7961, + "step": 984 + }, + { + "epoch": 0.2378652499396281, + "grad_norm": 0.316244900226593, + "learning_rate": 7.063735390605001e-05, + "loss": 1.7438, + "step": 985 + }, + { + "epoch": 0.23810673750301858, + "grad_norm": 0.3115881085395813, + "learning_rate": 7.061743550234867e-05, + "loss": 1.7114, + "step": 986 + }, + { + "epoch": 0.23834822506640907, + "grad_norm": 0.30240875482559204, + "learning_rate": 7.05974987482932e-05, + "loss": 1.9043, + "step": 987 + }, + { + "epoch": 0.23858971262979956, + "grad_norm": 0.29856961965560913, + "learning_rate": 7.057754365583252e-05, + "loss": 1.6706, + "step": 988 + }, + { + "epoch": 0.23883120019319004, + "grad_norm": 0.3027576804161072, + "learning_rate": 7.055757023692664e-05, + "loss": 1.6477, + "step": 989 + }, + { + "epoch": 0.23907268775658053, + "grad_norm": 0.29951512813568115, + "learning_rate": 7.053757850354646e-05, + "loss": 1.7039, + "step": 990 + }, + { + "epoch": 0.239314175319971, + "grad_norm": 0.3174339532852173, + "learning_rate": 7.051756846767392e-05, + 
"loss": 1.7394, + "step": 991 + }, + { + "epoch": 0.2395556628833615, + "grad_norm": 0.36309531331062317, + "learning_rate": 7.049754014130186e-05, + "loss": 1.7498, + "step": 992 + }, + { + "epoch": 0.23979715044675198, + "grad_norm": 0.32905271649360657, + "learning_rate": 7.047749353643416e-05, + "loss": 1.93, + "step": 993 + }, + { + "epoch": 0.24003863801014247, + "grad_norm": 0.33373570442199707, + "learning_rate": 7.045742866508557e-05, + "loss": 1.9002, + "step": 994 + }, + { + "epoch": 0.24028012557353295, + "grad_norm": 0.29678893089294434, + "learning_rate": 7.043734553928188e-05, + "loss": 1.7453, + "step": 995 + }, + { + "epoch": 0.24052161313692344, + "grad_norm": 0.3152943253517151, + "learning_rate": 7.041724417105977e-05, + "loss": 1.9029, + "step": 996 + }, + { + "epoch": 0.24076310070031393, + "grad_norm": 0.3238842785358429, + "learning_rate": 7.039712457246685e-05, + "loss": 1.8002, + "step": 997 + }, + { + "epoch": 0.2410045882637044, + "grad_norm": 0.3086811304092407, + "learning_rate": 7.037698675556167e-05, + "loss": 1.7737, + "step": 998 + }, + { + "epoch": 0.2412460758270949, + "grad_norm": 0.3081672191619873, + "learning_rate": 7.03568307324137e-05, + "loss": 1.7324, + "step": 999 + }, + { + "epoch": 0.24148756339048538, + "grad_norm": 0.31431376934051514, + "learning_rate": 7.03366565151033e-05, + "loss": 1.9154, + "step": 1000 + }, + { + "epoch": 0.24172905095387587, + "grad_norm": 0.3176340162754059, + "learning_rate": 7.031646411572175e-05, + "loss": 1.8225, + "step": 1001 + }, + { + "epoch": 0.24197053851726635, + "grad_norm": 0.30996355414390564, + "learning_rate": 7.029625354637126e-05, + "loss": 1.7401, + "step": 1002 + }, + { + "epoch": 0.24221202608065684, + "grad_norm": 0.30828937888145447, + "learning_rate": 7.027602481916487e-05, + "loss": 1.7273, + "step": 1003 + }, + { + "epoch": 0.24245351364404732, + "grad_norm": 0.31076958775520325, + "learning_rate": 7.025577794622655e-05, + "loss": 1.7303, + "step": 1004 + }, + { + "epoch": 0.2426950012074378, + "grad_norm": 0.29473546147346497, + "learning_rate": 7.023551293969111e-05, + "loss": 1.6771, + "step": 1005 + }, + { + "epoch": 0.2429364887708283, + "grad_norm": 0.3021571636199951, + "learning_rate": 7.021522981170426e-05, + "loss": 1.6781, + "step": 1006 + }, + { + "epoch": 0.24317797633421878, + "grad_norm": 0.3115958273410797, + "learning_rate": 7.019492857442254e-05, + "loss": 1.7734, + "step": 1007 + }, + { + "epoch": 0.24341946389760927, + "grad_norm": 0.31508898735046387, + "learning_rate": 7.017460924001337e-05, + "loss": 1.8933, + "step": 1008 + }, + { + "epoch": 0.24366095146099975, + "grad_norm": 0.30906936526298523, + "learning_rate": 7.015427182065502e-05, + "loss": 1.7643, + "step": 1009 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 0.3254733681678772, + "learning_rate": 7.013391632853658e-05, + "loss": 1.877, + "step": 1010 + }, + { + "epoch": 0.24414392658778072, + "grad_norm": 0.30368247628211975, + "learning_rate": 7.011354277585796e-05, + "loss": 1.7064, + "step": 1011 + }, + { + "epoch": 0.2443854141511712, + "grad_norm": 0.312253475189209, + "learning_rate": 7.009315117482992e-05, + "loss": 1.7001, + "step": 1012 + }, + { + "epoch": 0.2446269017145617, + "grad_norm": 0.3097797930240631, + "learning_rate": 7.007274153767401e-05, + "loss": 1.7155, + "step": 1013 + }, + { + "epoch": 0.24486838927795218, + "grad_norm": 0.32299181818962097, + "learning_rate": 7.005231387662266e-05, + "loss": 1.75, + "step": 1014 + }, + { + "epoch": 0.24510987684134267, + "grad_norm": 
0.3350425064563751, + "learning_rate": 7.003186820391902e-05, + "loss": 1.9598, + "step": 1015 + }, + { + "epoch": 0.24535136440473315, + "grad_norm": 0.31722837686538696, + "learning_rate": 7.001140453181705e-05, + "loss": 1.7972, + "step": 1016 + }, + { + "epoch": 0.24559285196812364, + "grad_norm": 0.33487066626548767, + "learning_rate": 6.999092287258155e-05, + "loss": 1.7209, + "step": 1017 + }, + { + "epoch": 0.24583433953151412, + "grad_norm": 0.299215167760849, + "learning_rate": 6.997042323848803e-05, + "loss": 1.7777, + "step": 1018 + }, + { + "epoch": 0.2460758270949046, + "grad_norm": 0.31593263149261475, + "learning_rate": 6.994990564182284e-05, + "loss": 1.9017, + "step": 1019 + }, + { + "epoch": 0.2463173146582951, + "grad_norm": 0.33589228987693787, + "learning_rate": 6.992937009488303e-05, + "loss": 1.7828, + "step": 1020 + }, + { + "epoch": 0.24655880222168558, + "grad_norm": 0.31036049127578735, + "learning_rate": 6.990881660997647e-05, + "loss": 1.6732, + "step": 1021 + }, + { + "epoch": 0.24680028978507607, + "grad_norm": 0.31886163353919983, + "learning_rate": 6.988824519942174e-05, + "loss": 1.745, + "step": 1022 + }, + { + "epoch": 0.24704177734846655, + "grad_norm": 0.3151240944862366, + "learning_rate": 6.986765587554818e-05, + "loss": 1.6845, + "step": 1023 + }, + { + "epoch": 0.24728326491185704, + "grad_norm": 0.33134594559669495, + "learning_rate": 6.984704865069587e-05, + "loss": 1.8795, + "step": 1024 + }, + { + "epoch": 0.24752475247524752, + "grad_norm": 0.3216524124145508, + "learning_rate": 6.98264235372156e-05, + "loss": 1.8034, + "step": 1025 + }, + { + "epoch": 0.247766240038638, + "grad_norm": 0.3143945336341858, + "learning_rate": 6.98057805474689e-05, + "loss": 1.7907, + "step": 1026 + }, + { + "epoch": 0.2480077276020285, + "grad_norm": 0.2938880920410156, + "learning_rate": 6.978511969382799e-05, + "loss": 1.6928, + "step": 1027 + }, + { + "epoch": 0.24824921516541898, + "grad_norm": 0.3236706852912903, + "learning_rate": 6.976444098867584e-05, + "loss": 1.8266, + "step": 1028 + }, + { + "epoch": 0.24849070272880946, + "grad_norm": 0.3211507499217987, + "learning_rate": 6.974374444440608e-05, + "loss": 1.6146, + "step": 1029 + }, + { + "epoch": 0.24873219029219995, + "grad_norm": 0.32520592212677, + "learning_rate": 6.972303007342304e-05, + "loss": 1.8695, + "step": 1030 + }, + { + "epoch": 0.24897367785559044, + "grad_norm": 0.3441825211048126, + "learning_rate": 6.970229788814176e-05, + "loss": 1.8257, + "step": 1031 + }, + { + "epoch": 0.24921516541898092, + "grad_norm": 0.31615981459617615, + "learning_rate": 6.968154790098791e-05, + "loss": 1.742, + "step": 1032 + }, + { + "epoch": 0.2494566529823714, + "grad_norm": 0.32662391662597656, + "learning_rate": 6.966078012439787e-05, + "loss": 1.7395, + "step": 1033 + }, + { + "epoch": 0.2496981405457619, + "grad_norm": 0.3284902572631836, + "learning_rate": 6.963999457081865e-05, + "loss": 1.9117, + "step": 1034 + }, + { + "epoch": 0.24993962810915238, + "grad_norm": 0.3873670697212219, + "learning_rate": 6.961919125270795e-05, + "loss": 1.9818, + "step": 1035 + }, + { + "epoch": 0.25018111567254286, + "grad_norm": 0.30828049778938293, + "learning_rate": 6.95983701825341e-05, + "loss": 1.7419, + "step": 1036 + }, + { + "epoch": 0.25042260323593335, + "grad_norm": 0.33050593733787537, + "learning_rate": 6.957753137277606e-05, + "loss": 1.8804, + "step": 1037 + }, + { + "epoch": 0.25066409079932384, + "grad_norm": 0.32716748118400574, + "learning_rate": 6.955667483592344e-05, + "loss": 1.7466, + 
"step": 1038 + }, + { + "epoch": 0.2509055783627143, + "grad_norm": 0.31101343035697937, + "learning_rate": 6.953580058447644e-05, + "loss": 1.6372, + "step": 1039 + }, + { + "epoch": 0.2511470659261048, + "grad_norm": 0.3111160397529602, + "learning_rate": 6.951490863094593e-05, + "loss": 1.7179, + "step": 1040 + }, + { + "epoch": 0.2513885534894953, + "grad_norm": 0.29902443289756775, + "learning_rate": 6.949399898785336e-05, + "loss": 1.6466, + "step": 1041 + }, + { + "epoch": 0.2516300410528858, + "grad_norm": 0.3357815146446228, + "learning_rate": 6.947307166773077e-05, + "loss": 1.8709, + "step": 1042 + }, + { + "epoch": 0.25187152861627626, + "grad_norm": 0.31973618268966675, + "learning_rate": 6.945212668312082e-05, + "loss": 1.6442, + "step": 1043 + }, + { + "epoch": 0.25211301617966675, + "grad_norm": 0.31211501359939575, + "learning_rate": 6.943116404657673e-05, + "loss": 1.6384, + "step": 1044 + }, + { + "epoch": 0.25235450374305723, + "grad_norm": 0.31134694814682007, + "learning_rate": 6.941018377066233e-05, + "loss": 1.7111, + "step": 1045 + }, + { + "epoch": 0.2525959913064477, + "grad_norm": 0.33072417974472046, + "learning_rate": 6.9389185867952e-05, + "loss": 1.795, + "step": 1046 + }, + { + "epoch": 0.2528374788698382, + "grad_norm": 0.31047114729881287, + "learning_rate": 6.93681703510307e-05, + "loss": 1.7906, + "step": 1047 + }, + { + "epoch": 0.2530789664332287, + "grad_norm": 0.3172812759876251, + "learning_rate": 6.934713723249394e-05, + "loss": 1.7707, + "step": 1048 + }, + { + "epoch": 0.2533204539966192, + "grad_norm": 0.3336034417152405, + "learning_rate": 6.932608652494775e-05, + "loss": 1.8711, + "step": 1049 + }, + { + "epoch": 0.25356194156000966, + "grad_norm": 0.30605006217956543, + "learning_rate": 6.930501824100876e-05, + "loss": 1.7119, + "step": 1050 + }, + { + "epoch": 0.25380342912340015, + "grad_norm": 0.31676533818244934, + "learning_rate": 6.92839323933041e-05, + "loss": 1.7777, + "step": 1051 + }, + { + "epoch": 0.25404491668679063, + "grad_norm": 0.3026861548423767, + "learning_rate": 6.926282899447145e-05, + "loss": 1.5476, + "step": 1052 + }, + { + "epoch": 0.2542864042501811, + "grad_norm": 0.3109389841556549, + "learning_rate": 6.924170805715894e-05, + "loss": 1.6907, + "step": 1053 + }, + { + "epoch": 0.2545278918135716, + "grad_norm": 0.31733840703964233, + "learning_rate": 6.922056959402528e-05, + "loss": 1.8424, + "step": 1054 + }, + { + "epoch": 0.2547693793769621, + "grad_norm": 0.3075104355812073, + "learning_rate": 6.919941361773971e-05, + "loss": 1.7506, + "step": 1055 + }, + { + "epoch": 0.2550108669403526, + "grad_norm": 0.30655089020729065, + "learning_rate": 6.917824014098187e-05, + "loss": 1.7237, + "step": 1056 + }, + { + "epoch": 0.25525235450374306, + "grad_norm": 0.3038625419139862, + "learning_rate": 6.915704917644196e-05, + "loss": 1.7619, + "step": 1057 + }, + { + "epoch": 0.25549384206713355, + "grad_norm": 0.3069184720516205, + "learning_rate": 6.913584073682062e-05, + "loss": 1.7937, + "step": 1058 + }, + { + "epoch": 0.25573532963052403, + "grad_norm": 0.3129485845565796, + "learning_rate": 6.911461483482903e-05, + "loss": 1.7742, + "step": 1059 + }, + { + "epoch": 0.2559768171939145, + "grad_norm": 0.3241616487503052, + "learning_rate": 6.909337148318877e-05, + "loss": 1.8027, + "step": 1060 + }, + { + "epoch": 0.256218304757305, + "grad_norm": 0.31016314029693604, + "learning_rate": 6.907211069463189e-05, + "loss": 1.6762, + "step": 1061 + }, + { + "epoch": 0.2564597923206955, + "grad_norm": 0.31560423970222473, 
+ "learning_rate": 6.90508324819009e-05, + "loss": 1.7494, + "step": 1062 + }, + { + "epoch": 0.256701279884086, + "grad_norm": 0.3282552659511566, + "learning_rate": 6.902953685774877e-05, + "loss": 1.7234, + "step": 1063 + }, + { + "epoch": 0.25694276744747646, + "grad_norm": 0.2940976321697235, + "learning_rate": 6.900822383493888e-05, + "loss": 1.6625, + "step": 1064 + }, + { + "epoch": 0.25718425501086695, + "grad_norm": 0.3287374973297119, + "learning_rate": 6.898689342624505e-05, + "loss": 1.7004, + "step": 1065 + }, + { + "epoch": 0.25742574257425743, + "grad_norm": 0.2989017367362976, + "learning_rate": 6.896554564445151e-05, + "loss": 1.7968, + "step": 1066 + }, + { + "epoch": 0.2576672301376479, + "grad_norm": 0.3247399628162384, + "learning_rate": 6.894418050235291e-05, + "loss": 1.8534, + "step": 1067 + }, + { + "epoch": 0.2579087177010384, + "grad_norm": 0.31024983525276184, + "learning_rate": 6.892279801275434e-05, + "loss": 1.7237, + "step": 1068 + }, + { + "epoch": 0.2581502052644289, + "grad_norm": 0.30201882123947144, + "learning_rate": 6.890139818847119e-05, + "loss": 1.6479, + "step": 1069 + }, + { + "epoch": 0.2583916928278194, + "grad_norm": 0.30573418736457825, + "learning_rate": 6.887998104232934e-05, + "loss": 1.7856, + "step": 1070 + }, + { + "epoch": 0.25863318039120986, + "grad_norm": 0.32034432888031006, + "learning_rate": 6.885854658716501e-05, + "loss": 1.8321, + "step": 1071 + }, + { + "epoch": 0.25887466795460035, + "grad_norm": 0.30810311436653137, + "learning_rate": 6.883709483582479e-05, + "loss": 1.761, + "step": 1072 + }, + { + "epoch": 0.25911615551799083, + "grad_norm": 0.3177671730518341, + "learning_rate": 6.881562580116563e-05, + "loss": 1.9071, + "step": 1073 + }, + { + "epoch": 0.2593576430813813, + "grad_norm": 0.3227800130844116, + "learning_rate": 6.879413949605488e-05, + "loss": 1.8626, + "step": 1074 + }, + { + "epoch": 0.2595991306447718, + "grad_norm": 0.31385767459869385, + "learning_rate": 6.877263593337018e-05, + "loss": 1.7978, + "step": 1075 + }, + { + "epoch": 0.2598406182081623, + "grad_norm": 0.3195452094078064, + "learning_rate": 6.875111512599959e-05, + "loss": 1.7311, + "step": 1076 + }, + { + "epoch": 0.2600821057715528, + "grad_norm": 0.30203190445899963, + "learning_rate": 6.87295770868414e-05, + "loss": 1.8604, + "step": 1077 + }, + { + "epoch": 0.26032359333494326, + "grad_norm": 0.31031322479248047, + "learning_rate": 6.870802182880436e-05, + "loss": 1.7341, + "step": 1078 + }, + { + "epoch": 0.26056508089833375, + "grad_norm": 0.3019157946109772, + "learning_rate": 6.868644936480741e-05, + "loss": 1.7871, + "step": 1079 + }, + { + "epoch": 0.26080656846172423, + "grad_norm": 0.30810457468032837, + "learning_rate": 6.866485970777988e-05, + "loss": 1.6875, + "step": 1080 + }, + { + "epoch": 0.2610480560251147, + "grad_norm": 0.2972477078437805, + "learning_rate": 6.864325287066141e-05, + "loss": 1.7081, + "step": 1081 + }, + { + "epoch": 0.2612895435885052, + "grad_norm": 0.30459290742874146, + "learning_rate": 6.862162886640187e-05, + "loss": 1.7533, + "step": 1082 + }, + { + "epoch": 0.2615310311518957, + "grad_norm": 0.31380796432495117, + "learning_rate": 6.85999877079615e-05, + "loss": 1.7714, + "step": 1083 + }, + { + "epoch": 0.2617725187152862, + "grad_norm": 0.33258458971977234, + "learning_rate": 6.857832940831076e-05, + "loss": 1.7071, + "step": 1084 + }, + { + "epoch": 0.26201400627867666, + "grad_norm": 0.3180256485939026, + "learning_rate": 6.855665398043041e-05, + "loss": 1.7715, + "step": 1085 + }, + { + 
"epoch": 0.26225549384206714, + "grad_norm": 0.30172500014305115, + "learning_rate": 6.853496143731148e-05, + "loss": 1.66, + "step": 1086 + }, + { + "epoch": 0.26249698140545763, + "grad_norm": 0.3087107837200165, + "learning_rate": 6.851325179195525e-05, + "loss": 1.8475, + "step": 1087 + }, + { + "epoch": 0.2627384689688481, + "grad_norm": 0.30850815773010254, + "learning_rate": 6.849152505737324e-05, + "loss": 1.6628, + "step": 1088 + }, + { + "epoch": 0.2629799565322386, + "grad_norm": 0.32561859488487244, + "learning_rate": 6.846978124658721e-05, + "loss": 1.8223, + "step": 1089 + }, + { + "epoch": 0.2632214440956291, + "grad_norm": 0.31419193744659424, + "learning_rate": 6.84480203726292e-05, + "loss": 1.7969, + "step": 1090 + }, + { + "epoch": 0.2634629316590196, + "grad_norm": 0.2968290448188782, + "learning_rate": 6.842624244854143e-05, + "loss": 1.6429, + "step": 1091 + }, + { + "epoch": 0.26370441922241006, + "grad_norm": 0.29795342683792114, + "learning_rate": 6.840444748737634e-05, + "loss": 1.6196, + "step": 1092 + }, + { + "epoch": 0.26394590678580054, + "grad_norm": 0.3083789646625519, + "learning_rate": 6.838263550219661e-05, + "loss": 1.6487, + "step": 1093 + }, + { + "epoch": 0.26418739434919103, + "grad_norm": 0.3154310882091522, + "learning_rate": 6.83608065060751e-05, + "loss": 1.7913, + "step": 1094 + }, + { + "epoch": 0.2644288819125815, + "grad_norm": 0.31129124760627747, + "learning_rate": 6.833896051209488e-05, + "loss": 1.8084, + "step": 1095 + }, + { + "epoch": 0.264670369475972, + "grad_norm": 0.3194768726825714, + "learning_rate": 6.831709753334917e-05, + "loss": 1.7722, + "step": 1096 + }, + { + "epoch": 0.2649118570393625, + "grad_norm": 0.30591970682144165, + "learning_rate": 6.829521758294145e-05, + "loss": 1.6323, + "step": 1097 + }, + { + "epoch": 0.26515334460275297, + "grad_norm": 0.34531369805336, + "learning_rate": 6.827332067398527e-05, + "loss": 1.9616, + "step": 1098 + }, + { + "epoch": 0.26539483216614346, + "grad_norm": 0.3163597881793976, + "learning_rate": 6.825140681960442e-05, + "loss": 1.8197, + "step": 1099 + }, + { + "epoch": 0.26563631972953394, + "grad_norm": 0.2946029603481293, + "learning_rate": 6.822947603293281e-05, + "loss": 1.5898, + "step": 1100 + }, + { + "epoch": 0.26587780729292443, + "grad_norm": 0.30496641993522644, + "learning_rate": 6.820752832711453e-05, + "loss": 1.5909, + "step": 1101 + }, + { + "epoch": 0.2661192948563149, + "grad_norm": 0.33542415499687195, + "learning_rate": 6.818556371530378e-05, + "loss": 1.781, + "step": 1102 + }, + { + "epoch": 0.2663607824197054, + "grad_norm": 0.29307273030281067, + "learning_rate": 6.81635822106649e-05, + "loss": 1.6314, + "step": 1103 + }, + { + "epoch": 0.2666022699830959, + "grad_norm": 0.3387092053890228, + "learning_rate": 6.814158382637235e-05, + "loss": 1.8468, + "step": 1104 + }, + { + "epoch": 0.26684375754648637, + "grad_norm": 0.31816431879997253, + "learning_rate": 6.811956857561074e-05, + "loss": 1.6737, + "step": 1105 + }, + { + "epoch": 0.26708524510987686, + "grad_norm": 0.316036581993103, + "learning_rate": 6.809753647157472e-05, + "loss": 1.7484, + "step": 1106 + }, + { + "epoch": 0.26732673267326734, + "grad_norm": 0.33369365334510803, + "learning_rate": 6.807548752746911e-05, + "loss": 1.8675, + "step": 1107 + }, + { + "epoch": 0.2675682202366578, + "grad_norm": 0.31673648953437805, + "learning_rate": 6.805342175650881e-05, + "loss": 1.7556, + "step": 1108 + }, + { + "epoch": 0.2678097078000483, + "grad_norm": 0.3155718147754669, + "learning_rate": 
6.803133917191878e-05, + "loss": 1.6849, + "step": 1109 + }, + { + "epoch": 0.2680511953634388, + "grad_norm": 0.31575703620910645, + "learning_rate": 6.800923978693403e-05, + "loss": 1.7253, + "step": 1110 + }, + { + "epoch": 0.2682926829268293, + "grad_norm": 0.2937328815460205, + "learning_rate": 6.798712361479974e-05, + "loss": 1.6773, + "step": 1111 + }, + { + "epoch": 0.26853417049021977, + "grad_norm": 0.30649858713150024, + "learning_rate": 6.796499066877106e-05, + "loss": 1.7406, + "step": 1112 + }, + { + "epoch": 0.26877565805361026, + "grad_norm": 0.34489336609840393, + "learning_rate": 6.79428409621132e-05, + "loss": 1.6196, + "step": 1113 + }, + { + "epoch": 0.26901714561700074, + "grad_norm": 0.29834282398223877, + "learning_rate": 6.792067450810149e-05, + "loss": 1.6866, + "step": 1114 + }, + { + "epoch": 0.2692586331803912, + "grad_norm": 0.3076286315917969, + "learning_rate": 6.78984913200212e-05, + "loss": 1.8003, + "step": 1115 + }, + { + "epoch": 0.2695001207437817, + "grad_norm": 0.3159911632537842, + "learning_rate": 6.78762914111677e-05, + "loss": 1.766, + "step": 1116 + }, + { + "epoch": 0.2697416083071722, + "grad_norm": 0.3220473527908325, + "learning_rate": 6.785407479484633e-05, + "loss": 1.8153, + "step": 1117 + }, + { + "epoch": 0.2699830958705627, + "grad_norm": 0.31992602348327637, + "learning_rate": 6.78318414843725e-05, + "loss": 1.9029, + "step": 1118 + }, + { + "epoch": 0.27022458343395317, + "grad_norm": 0.31522971391677856, + "learning_rate": 6.780959149307156e-05, + "loss": 1.7615, + "step": 1119 + }, + { + "epoch": 0.27046607099734365, + "grad_norm": 0.3177500069141388, + "learning_rate": 6.778732483427895e-05, + "loss": 1.8575, + "step": 1120 + }, + { + "epoch": 0.27070755856073414, + "grad_norm": 0.3230820596218109, + "learning_rate": 6.776504152134e-05, + "loss": 1.5958, + "step": 1121 + }, + { + "epoch": 0.2709490461241246, + "grad_norm": 0.30711400508880615, + "learning_rate": 6.774274156761004e-05, + "loss": 1.6727, + "step": 1122 + }, + { + "epoch": 0.2711905336875151, + "grad_norm": 0.3092270791530609, + "learning_rate": 6.772042498645446e-05, + "loss": 1.7553, + "step": 1123 + }, + { + "epoch": 0.2714320212509056, + "grad_norm": 0.3213646411895752, + "learning_rate": 6.769809179124851e-05, + "loss": 1.7792, + "step": 1124 + }, + { + "epoch": 0.2716735088142961, + "grad_norm": 0.3240359127521515, + "learning_rate": 6.767574199537744e-05, + "loss": 1.6274, + "step": 1125 + }, + { + "epoch": 0.27191499637768657, + "grad_norm": 0.2978704869747162, + "learning_rate": 6.765337561223647e-05, + "loss": 1.6713, + "step": 1126 + }, + { + "epoch": 0.27215648394107705, + "grad_norm": 0.3193483054637909, + "learning_rate": 6.763099265523073e-05, + "loss": 1.8093, + "step": 1127 + }, + { + "epoch": 0.27239797150446754, + "grad_norm": 0.31814515590667725, + "learning_rate": 6.760859313777531e-05, + "loss": 1.8676, + "step": 1128 + }, + { + "epoch": 0.272639459067858, + "grad_norm": 0.4061098098754883, + "learning_rate": 6.758617707329517e-05, + "loss": 1.9431, + "step": 1129 + }, + { + "epoch": 0.2728809466312485, + "grad_norm": 0.2891682982444763, + "learning_rate": 6.756374447522527e-05, + "loss": 1.6673, + "step": 1130 + }, + { + "epoch": 0.273122434194639, + "grad_norm": 0.33421790599823, + "learning_rate": 6.754129535701044e-05, + "loss": 1.8145, + "step": 1131 + }, + { + "epoch": 0.2733639217580295, + "grad_norm": 0.3227134346961975, + "learning_rate": 6.751882973210537e-05, + "loss": 1.7578, + "step": 1132 + }, + { + "epoch": 0.27360540932141997, + 
"grad_norm": 0.31454476714134216, + "learning_rate": 6.74963476139747e-05, + "loss": 1.7671, + "step": 1133 + }, + { + "epoch": 0.27384689688481045, + "grad_norm": 0.317026287317276, + "learning_rate": 6.747384901609294e-05, + "loss": 1.7762, + "step": 1134 + }, + { + "epoch": 0.27408838444820094, + "grad_norm": 0.2962716817855835, + "learning_rate": 6.745133395194447e-05, + "loss": 1.5824, + "step": 1135 + }, + { + "epoch": 0.2743298720115914, + "grad_norm": 0.30856993794441223, + "learning_rate": 6.742880243502354e-05, + "loss": 1.7647, + "step": 1136 + }, + { + "epoch": 0.2745713595749819, + "grad_norm": 0.319867879152298, + "learning_rate": 6.740625447883428e-05, + "loss": 1.6957, + "step": 1137 + }, + { + "epoch": 0.2748128471383724, + "grad_norm": 0.3080596625804901, + "learning_rate": 6.738369009689064e-05, + "loss": 1.7083, + "step": 1138 + }, + { + "epoch": 0.2750543347017629, + "grad_norm": 0.3023444712162018, + "learning_rate": 6.736110930271642e-05, + "loss": 1.6312, + "step": 1139 + }, + { + "epoch": 0.27529582226515337, + "grad_norm": 0.2923286259174347, + "learning_rate": 6.733851210984529e-05, + "loss": 1.6025, + "step": 1140 + }, + { + "epoch": 0.27553730982854385, + "grad_norm": 0.32204321026802063, + "learning_rate": 6.731589853182071e-05, + "loss": 1.6971, + "step": 1141 + }, + { + "epoch": 0.27577879739193434, + "grad_norm": 0.3155701160430908, + "learning_rate": 6.729326858219599e-05, + "loss": 1.801, + "step": 1142 + }, + { + "epoch": 0.2760202849553248, + "grad_norm": 0.30704465508461, + "learning_rate": 6.727062227453423e-05, + "loss": 1.8037, + "step": 1143 + }, + { + "epoch": 0.2762617725187153, + "grad_norm": 0.3244346082210541, + "learning_rate": 6.724795962240834e-05, + "loss": 1.7041, + "step": 1144 + }, + { + "epoch": 0.2765032600821058, + "grad_norm": 0.3154827952384949, + "learning_rate": 6.722528063940102e-05, + "loss": 1.8805, + "step": 1145 + }, + { + "epoch": 0.2767447476454963, + "grad_norm": 0.30870237946510315, + "learning_rate": 6.720258533910478e-05, + "loss": 1.6691, + "step": 1146 + }, + { + "epoch": 0.27698623520888677, + "grad_norm": 0.31408193707466125, + "learning_rate": 6.71798737351219e-05, + "loss": 1.5256, + "step": 1147 + }, + { + "epoch": 0.27722772277227725, + "grad_norm": 0.30898284912109375, + "learning_rate": 6.71571458410644e-05, + "loss": 1.7632, + "step": 1148 + }, + { + "epoch": 0.27746921033566774, + "grad_norm": 0.33258867263793945, + "learning_rate": 6.713440167055414e-05, + "loss": 1.9591, + "step": 1149 + }, + { + "epoch": 0.2777106978990582, + "grad_norm": 0.3209589421749115, + "learning_rate": 6.711164123722264e-05, + "loss": 1.8063, + "step": 1150 + }, + { + "epoch": 0.2779521854624487, + "grad_norm": 0.2860225439071655, + "learning_rate": 6.708886455471122e-05, + "loss": 1.6478, + "step": 1151 + }, + { + "epoch": 0.2781936730258392, + "grad_norm": 0.3342827558517456, + "learning_rate": 6.706607163667094e-05, + "loss": 1.7519, + "step": 1152 + }, + { + "epoch": 0.2784351605892297, + "grad_norm": 0.3273380398750305, + "learning_rate": 6.704326249676261e-05, + "loss": 1.7519, + "step": 1153 + }, + { + "epoch": 0.27867664815262017, + "grad_norm": 0.30446043610572815, + "learning_rate": 6.702043714865668e-05, + "loss": 1.8829, + "step": 1154 + }, + { + "epoch": 0.27891813571601065, + "grad_norm": 0.327215313911438, + "learning_rate": 6.69975956060334e-05, + "loss": 1.8613, + "step": 1155 + }, + { + "epoch": 0.27915962327940114, + "grad_norm": 0.3009372353553772, + "learning_rate": 6.697473788258269e-05, + "loss": 1.7337, + 
"step": 1156 + }, + { + "epoch": 0.2794011108427916, + "grad_norm": 0.2950308620929718, + "learning_rate": 6.695186399200416e-05, + "loss": 1.6854, + "step": 1157 + }, + { + "epoch": 0.2796425984061821, + "grad_norm": 0.3036794364452362, + "learning_rate": 6.692897394800716e-05, + "loss": 1.5876, + "step": 1158 + }, + { + "epoch": 0.2798840859695726, + "grad_norm": 0.34749823808670044, + "learning_rate": 6.690606776431066e-05, + "loss": 1.8013, + "step": 1159 + }, + { + "epoch": 0.2801255735329631, + "grad_norm": 0.31820452213287354, + "learning_rate": 6.688314545464331e-05, + "loss": 1.9421, + "step": 1160 + }, + { + "epoch": 0.28036706109635356, + "grad_norm": 0.318487286567688, + "learning_rate": 6.686020703274347e-05, + "loss": 1.7597, + "step": 1161 + }, + { + "epoch": 0.28060854865974405, + "grad_norm": 0.3037980794906616, + "learning_rate": 6.683725251235911e-05, + "loss": 1.7721, + "step": 1162 + }, + { + "epoch": 0.28085003622313454, + "grad_norm": 0.3123769760131836, + "learning_rate": 6.681428190724789e-05, + "loss": 1.6083, + "step": 1163 + }, + { + "epoch": 0.281091523786525, + "grad_norm": 0.3207729458808899, + "learning_rate": 6.679129523117706e-05, + "loss": 1.7156, + "step": 1164 + }, + { + "epoch": 0.28133301134991545, + "grad_norm": 0.31085196137428284, + "learning_rate": 6.676829249792355e-05, + "loss": 1.6714, + "step": 1165 + }, + { + "epoch": 0.28157449891330594, + "grad_norm": 0.3331851661205292, + "learning_rate": 6.674527372127389e-05, + "loss": 1.9505, + "step": 1166 + }, + { + "epoch": 0.2818159864766964, + "grad_norm": 0.3235446512699127, + "learning_rate": 6.67222389150242e-05, + "loss": 1.6968, + "step": 1167 + }, + { + "epoch": 0.2820574740400869, + "grad_norm": 0.31274673342704773, + "learning_rate": 6.66991880929803e-05, + "loss": 1.7527, + "step": 1168 + }, + { + "epoch": 0.2822989616034774, + "grad_norm": 0.29938244819641113, + "learning_rate": 6.667612126895748e-05, + "loss": 1.6292, + "step": 1169 + }, + { + "epoch": 0.2825404491668679, + "grad_norm": 0.3175697922706604, + "learning_rate": 6.665303845678072e-05, + "loss": 1.8182, + "step": 1170 + }, + { + "epoch": 0.28278193673025837, + "grad_norm": 0.315643310546875, + "learning_rate": 6.662993967028455e-05, + "loss": 1.6594, + "step": 1171 + }, + { + "epoch": 0.28302342429364885, + "grad_norm": 0.33789125084877014, + "learning_rate": 6.660682492331305e-05, + "loss": 1.7726, + "step": 1172 + }, + { + "epoch": 0.28326491185703934, + "grad_norm": 0.305123507976532, + "learning_rate": 6.65836942297199e-05, + "loss": 1.7628, + "step": 1173 + }, + { + "epoch": 0.2835063994204298, + "grad_norm": 0.31694090366363525, + "learning_rate": 6.656054760336834e-05, + "loss": 1.7854, + "step": 1174 + }, + { + "epoch": 0.2837478869838203, + "grad_norm": 0.3205931484699249, + "learning_rate": 6.653738505813114e-05, + "loss": 1.856, + "step": 1175 + }, + { + "epoch": 0.2839893745472108, + "grad_norm": 0.303684800863266, + "learning_rate": 6.651420660789061e-05, + "loss": 1.6458, + "step": 1176 + }, + { + "epoch": 0.2842308621106013, + "grad_norm": 0.32911720871925354, + "learning_rate": 6.649101226653857e-05, + "loss": 1.9941, + "step": 1177 + }, + { + "epoch": 0.28447234967399176, + "grad_norm": 0.33249083161354065, + "learning_rate": 6.646780204797644e-05, + "loss": 1.8222, + "step": 1178 + }, + { + "epoch": 0.28471383723738225, + "grad_norm": 0.316508948802948, + "learning_rate": 6.644457596611508e-05, + "loss": 1.6778, + "step": 1179 + }, + { + "epoch": 0.28495532480077274, + "grad_norm": 0.31347498297691345, + 
"learning_rate": 6.642133403487491e-05, + "loss": 1.6783, + "step": 1180 + }, + { + "epoch": 0.2851968123641632, + "grad_norm": 0.30862924456596375, + "learning_rate": 6.639807626818579e-05, + "loss": 1.7761, + "step": 1181 + }, + { + "epoch": 0.2854382999275537, + "grad_norm": 0.31827664375305176, + "learning_rate": 6.637480267998713e-05, + "loss": 1.7578, + "step": 1182 + }, + { + "epoch": 0.2856797874909442, + "grad_norm": 0.3379991352558136, + "learning_rate": 6.63515132842278e-05, + "loss": 1.9115, + "step": 1183 + }, + { + "epoch": 0.2859212750543347, + "grad_norm": 0.3422394394874573, + "learning_rate": 6.632820809486612e-05, + "loss": 1.9046, + "step": 1184 + }, + { + "epoch": 0.28616276261772516, + "grad_norm": 0.3095594644546509, + "learning_rate": 6.630488712586992e-05, + "loss": 1.6895, + "step": 1185 + }, + { + "epoch": 0.28640425018111565, + "grad_norm": 0.31483933329582214, + "learning_rate": 6.628155039121649e-05, + "loss": 1.7782, + "step": 1186 + }, + { + "epoch": 0.28664573774450613, + "grad_norm": 0.30376529693603516, + "learning_rate": 6.625819790489248e-05, + "loss": 1.7967, + "step": 1187 + }, + { + "epoch": 0.2868872253078966, + "grad_norm": 0.29914677143096924, + "learning_rate": 6.623482968089409e-05, + "loss": 1.6851, + "step": 1188 + }, + { + "epoch": 0.2871287128712871, + "grad_norm": 0.3073113262653351, + "learning_rate": 6.62114457332269e-05, + "loss": 1.779, + "step": 1189 + }, + { + "epoch": 0.2873702004346776, + "grad_norm": 0.3353249728679657, + "learning_rate": 6.618804607590593e-05, + "loss": 1.9511, + "step": 1190 + }, + { + "epoch": 0.2876116879980681, + "grad_norm": 0.38719838857650757, + "learning_rate": 6.616463072295559e-05, + "loss": 1.8926, + "step": 1191 + }, + { + "epoch": 0.28785317556145856, + "grad_norm": 0.3062868118286133, + "learning_rate": 6.614119968840974e-05, + "loss": 1.6365, + "step": 1192 + }, + { + "epoch": 0.28809466312484905, + "grad_norm": 0.3393278121948242, + "learning_rate": 6.611775298631159e-05, + "loss": 1.8572, + "step": 1193 + }, + { + "epoch": 0.28833615068823953, + "grad_norm": 0.29543522000312805, + "learning_rate": 6.609429063071377e-05, + "loss": 1.6415, + "step": 1194 + }, + { + "epoch": 0.28857763825163, + "grad_norm": 0.29946866631507874, + "learning_rate": 6.607081263567827e-05, + "loss": 1.6446, + "step": 1195 + }, + { + "epoch": 0.2888191258150205, + "grad_norm": 0.3174428343772888, + "learning_rate": 6.604731901527649e-05, + "loss": 1.8043, + "step": 1196 + }, + { + "epoch": 0.289060613378411, + "grad_norm": 0.3169965147972107, + "learning_rate": 6.602380978358918e-05, + "loss": 1.816, + "step": 1197 + }, + { + "epoch": 0.2893021009418015, + "grad_norm": 0.315725713968277, + "learning_rate": 6.600028495470642e-05, + "loss": 1.7574, + "step": 1198 + }, + { + "epoch": 0.28954358850519196, + "grad_norm": 0.3309673070907593, + "learning_rate": 6.597674454272765e-05, + "loss": 1.9127, + "step": 1199 + }, + { + "epoch": 0.28978507606858245, + "grad_norm": 0.28161945939064026, + "learning_rate": 6.595318856176169e-05, + "loss": 1.5288, + "step": 1200 + }, + { + "epoch": 0.29002656363197293, + "grad_norm": 0.3262319564819336, + "learning_rate": 6.592961702592662e-05, + "loss": 1.836, + "step": 1201 + }, + { + "epoch": 0.2902680511953634, + "grad_norm": 0.30109328031539917, + "learning_rate": 6.590602994934993e-05, + "loss": 1.6786, + "step": 1202 + }, + { + "epoch": 0.2905095387587539, + "grad_norm": 0.30040109157562256, + "learning_rate": 6.588242734616833e-05, + "loss": 1.6941, + "step": 1203 + }, + { + "epoch": 
0.2907510263221444, + "grad_norm": 0.3145768344402313, + "learning_rate": 6.58588092305279e-05, + "loss": 1.6959, + "step": 1204 + }, + { + "epoch": 0.2909925138855349, + "grad_norm": 0.32511648535728455, + "learning_rate": 6.583517561658401e-05, + "loss": 1.6826, + "step": 1205 + }, + { + "epoch": 0.29123400144892536, + "grad_norm": 0.3018747866153717, + "learning_rate": 6.58115265185013e-05, + "loss": 1.7152, + "step": 1206 + }, + { + "epoch": 0.29147548901231585, + "grad_norm": 0.34387052059173584, + "learning_rate": 6.578786195045368e-05, + "loss": 1.8679, + "step": 1207 + }, + { + "epoch": 0.29171697657570633, + "grad_norm": 0.35244396328926086, + "learning_rate": 6.576418192662436e-05, + "loss": 1.9484, + "step": 1208 + }, + { + "epoch": 0.2919584641390968, + "grad_norm": 0.3194985091686249, + "learning_rate": 6.574048646120582e-05, + "loss": 1.7235, + "step": 1209 + }, + { + "epoch": 0.2921999517024873, + "grad_norm": 0.30800020694732666, + "learning_rate": 6.571677556839976e-05, + "loss": 1.7975, + "step": 1210 + }, + { + "epoch": 0.2924414392658778, + "grad_norm": 0.3275563716888428, + "learning_rate": 6.569304926241715e-05, + "loss": 2.0784, + "step": 1211 + }, + { + "epoch": 0.2926829268292683, + "grad_norm": 0.3347807824611664, + "learning_rate": 6.566930755747821e-05, + "loss": 1.9368, + "step": 1212 + }, + { + "epoch": 0.29292441439265876, + "grad_norm": 0.3164602220058441, + "learning_rate": 6.564555046781232e-05, + "loss": 1.7644, + "step": 1213 + }, + { + "epoch": 0.29316590195604925, + "grad_norm": 0.29280054569244385, + "learning_rate": 6.562177800765819e-05, + "loss": 1.5814, + "step": 1214 + }, + { + "epoch": 0.29340738951943973, + "grad_norm": 0.29822081327438354, + "learning_rate": 6.559799019126365e-05, + "loss": 1.657, + "step": 1215 + }, + { + "epoch": 0.2936488770828302, + "grad_norm": 0.30289626121520996, + "learning_rate": 6.557418703288578e-05, + "loss": 1.8473, + "step": 1216 + }, + { + "epoch": 0.2938903646462207, + "grad_norm": 0.301372766494751, + "learning_rate": 6.555036854679083e-05, + "loss": 1.6417, + "step": 1217 + }, + { + "epoch": 0.2941318522096112, + "grad_norm": 0.31504714488983154, + "learning_rate": 6.552653474725427e-05, + "loss": 1.8229, + "step": 1218 + }, + { + "epoch": 0.2943733397730017, + "grad_norm": 0.31168702244758606, + "learning_rate": 6.550268564856071e-05, + "loss": 1.8771, + "step": 1219 + }, + { + "epoch": 0.29461482733639216, + "grad_norm": 0.31231966614723206, + "learning_rate": 6.547882126500395e-05, + "loss": 1.7629, + "step": 1220 + }, + { + "epoch": 0.29485631489978265, + "grad_norm": 0.33414316177368164, + "learning_rate": 6.545494161088696e-05, + "loss": 1.8771, + "step": 1221 + }, + { + "epoch": 0.29509780246317313, + "grad_norm": 0.3109551966190338, + "learning_rate": 6.543104670052183e-05, + "loss": 1.7909, + "step": 1222 + }, + { + "epoch": 0.2953392900265636, + "grad_norm": 0.31267592310905457, + "learning_rate": 6.540713654822984e-05, + "loss": 1.739, + "step": 1223 + }, + { + "epoch": 0.2955807775899541, + "grad_norm": 0.3014405071735382, + "learning_rate": 6.538321116834135e-05, + "loss": 1.6701, + "step": 1224 + }, + { + "epoch": 0.2958222651533446, + "grad_norm": 0.30508482456207275, + "learning_rate": 6.535927057519591e-05, + "loss": 1.6512, + "step": 1225 + }, + { + "epoch": 0.2960637527167351, + "grad_norm": 0.314610093832016, + "learning_rate": 6.533531478314212e-05, + "loss": 1.7084, + "step": 1226 + }, + { + "epoch": 0.29630524028012556, + "grad_norm": 0.33772405982017517, + "learning_rate": 
6.531134380653774e-05, + "loss": 1.7774, + "step": 1227 + }, + { + "epoch": 0.29654672784351604, + "grad_norm": 0.3018490672111511, + "learning_rate": 6.52873576597496e-05, + "loss": 1.6606, + "step": 1228 + }, + { + "epoch": 0.29678821540690653, + "grad_norm": 0.30412787199020386, + "learning_rate": 6.526335635715365e-05, + "loss": 1.6831, + "step": 1229 + }, + { + "epoch": 0.297029702970297, + "grad_norm": 0.3108169734477997, + "learning_rate": 6.523933991313491e-05, + "loss": 1.6385, + "step": 1230 + }, + { + "epoch": 0.2972711905336875, + "grad_norm": 0.2972755432128906, + "learning_rate": 6.521530834208748e-05, + "loss": 1.6056, + "step": 1231 + }, + { + "epoch": 0.297512678097078, + "grad_norm": 0.3079698085784912, + "learning_rate": 6.519126165841449e-05, + "loss": 1.6153, + "step": 1232 + }, + { + "epoch": 0.2977541656604685, + "grad_norm": 0.31394216418266296, + "learning_rate": 6.516719987652819e-05, + "loss": 1.8146, + "step": 1233 + }, + { + "epoch": 0.29799565322385896, + "grad_norm": 0.3352009952068329, + "learning_rate": 6.514312301084983e-05, + "loss": 1.8685, + "step": 1234 + }, + { + "epoch": 0.29823714078724944, + "grad_norm": 0.3122904598712921, + "learning_rate": 6.511903107580973e-05, + "loss": 1.6114, + "step": 1235 + }, + { + "epoch": 0.29847862835063993, + "grad_norm": 0.32792016863822937, + "learning_rate": 6.509492408584723e-05, + "loss": 1.7083, + "step": 1236 + }, + { + "epoch": 0.2987201159140304, + "grad_norm": 0.3028082549571991, + "learning_rate": 6.507080205541068e-05, + "loss": 1.7556, + "step": 1237 + }, + { + "epoch": 0.2989616034774209, + "grad_norm": 0.3209961950778961, + "learning_rate": 6.504666499895746e-05, + "loss": 1.8044, + "step": 1238 + }, + { + "epoch": 0.2992030910408114, + "grad_norm": 0.3235771059989929, + "learning_rate": 6.502251293095394e-05, + "loss": 1.7247, + "step": 1239 + }, + { + "epoch": 0.29944457860420187, + "grad_norm": 0.32376086711883545, + "learning_rate": 6.499834586587552e-05, + "loss": 1.9585, + "step": 1240 + }, + { + "epoch": 0.29968606616759236, + "grad_norm": 0.33287283778190613, + "learning_rate": 6.497416381820656e-05, + "loss": 1.7671, + "step": 1241 + }, + { + "epoch": 0.29992755373098284, + "grad_norm": 0.3164242208003998, + "learning_rate": 6.494996680244044e-05, + "loss": 1.6698, + "step": 1242 + }, + { + "epoch": 0.30016904129437333, + "grad_norm": 0.31160667538642883, + "learning_rate": 6.49257548330794e-05, + "loss": 1.6573, + "step": 1243 + }, + { + "epoch": 0.3004105288577638, + "grad_norm": 0.3194431662559509, + "learning_rate": 6.49015279246348e-05, + "loss": 1.7531, + "step": 1244 + }, + { + "epoch": 0.3006520164211543, + "grad_norm": 0.3261190950870514, + "learning_rate": 6.487728609162684e-05, + "loss": 1.7351, + "step": 1245 + }, + { + "epoch": 0.3008935039845448, + "grad_norm": 0.31367936730384827, + "learning_rate": 6.48530293485847e-05, + "loss": 1.7731, + "step": 1246 + }, + { + "epoch": 0.30113499154793527, + "grad_norm": 0.3270440995693207, + "learning_rate": 6.48287577100465e-05, + "loss": 1.81, + "step": 1247 + }, + { + "epoch": 0.30137647911132576, + "grad_norm": 0.3517354428768158, + "learning_rate": 6.480447119055929e-05, + "loss": 1.9651, + "step": 1248 + }, + { + "epoch": 0.30161796667471624, + "grad_norm": 0.33357125520706177, + "learning_rate": 6.478016980467901e-05, + "loss": 1.8683, + "step": 1249 + }, + { + "epoch": 0.30185945423810673, + "grad_norm": 0.32203903794288635, + "learning_rate": 6.475585356697056e-05, + "loss": 1.6253, + "step": 1250 + }, + { + "epoch": 
0.3021009418014972, + "grad_norm": 0.33326178789138794, + "learning_rate": 6.473152249200771e-05, + "loss": 1.848, + "step": 1251 + }, + { + "epoch": 0.3023424293648877, + "grad_norm": 0.30899542570114136, + "learning_rate": 6.470717659437309e-05, + "loss": 1.7319, + "step": 1252 + }, + { + "epoch": 0.3025839169282782, + "grad_norm": 0.30828720331192017, + "learning_rate": 6.46828158886583e-05, + "loss": 1.6604, + "step": 1253 + }, + { + "epoch": 0.30282540449166867, + "grad_norm": 0.3537421226501465, + "learning_rate": 6.465844038946374e-05, + "loss": 1.7714, + "step": 1254 + }, + { + "epoch": 0.30306689205505916, + "grad_norm": 0.31548118591308594, + "learning_rate": 6.463405011139869e-05, + "loss": 1.7093, + "step": 1255 + }, + { + "epoch": 0.30330837961844964, + "grad_norm": 0.32706400752067566, + "learning_rate": 6.460964506908133e-05, + "loss": 1.8115, + "step": 1256 + }, + { + "epoch": 0.3035498671818401, + "grad_norm": 0.317264199256897, + "learning_rate": 6.458522527713862e-05, + "loss": 1.7378, + "step": 1257 + }, + { + "epoch": 0.3037913547452306, + "grad_norm": 0.3122968375682831, + "learning_rate": 6.456079075020644e-05, + "loss": 1.783, + "step": 1258 + }, + { + "epoch": 0.3040328423086211, + "grad_norm": 0.37204188108444214, + "learning_rate": 6.453634150292943e-05, + "loss": 2.2071, + "step": 1259 + }, + { + "epoch": 0.3042743298720116, + "grad_norm": 0.3135020434856415, + "learning_rate": 6.451187754996109e-05, + "loss": 1.8169, + "step": 1260 + }, + { + "epoch": 0.30451581743540207, + "grad_norm": 0.30854514241218567, + "learning_rate": 6.448739890596373e-05, + "loss": 1.4994, + "step": 1261 + }, + { + "epoch": 0.30475730499879256, + "grad_norm": 0.3245062232017517, + "learning_rate": 6.446290558560845e-05, + "loss": 1.794, + "step": 1262 + }, + { + "epoch": 0.30499879256218304, + "grad_norm": 0.31106555461883545, + "learning_rate": 6.443839760357517e-05, + "loss": 1.706, + "step": 1263 + }, + { + "epoch": 0.3052402801255735, + "grad_norm": 0.3059476912021637, + "learning_rate": 6.441387497455259e-05, + "loss": 1.7567, + "step": 1264 + }, + { + "epoch": 0.305481767688964, + "grad_norm": 0.3407411575317383, + "learning_rate": 6.438933771323816e-05, + "loss": 1.8181, + "step": 1265 + }, + { + "epoch": 0.3057232552523545, + "grad_norm": 0.3178406357765198, + "learning_rate": 6.436478583433812e-05, + "loss": 1.5195, + "step": 1266 + }, + { + "epoch": 0.305964742815745, + "grad_norm": 0.31224748492240906, + "learning_rate": 6.43402193525675e-05, + "loss": 1.5679, + "step": 1267 + }, + { + "epoch": 0.30620623037913547, + "grad_norm": 0.36681729555130005, + "learning_rate": 6.431563828265005e-05, + "loss": 2.1015, + "step": 1268 + }, + { + "epoch": 0.30644771794252595, + "grad_norm": 0.315141886472702, + "learning_rate": 6.429104263931825e-05, + "loss": 1.6646, + "step": 1269 + }, + { + "epoch": 0.30668920550591644, + "grad_norm": 0.3100356161594391, + "learning_rate": 6.426643243731336e-05, + "loss": 1.6975, + "step": 1270 + }, + { + "epoch": 0.3069306930693069, + "grad_norm": 0.33374282717704773, + "learning_rate": 6.424180769138531e-05, + "loss": 1.8585, + "step": 1271 + }, + { + "epoch": 0.3071721806326974, + "grad_norm": 0.3375101685523987, + "learning_rate": 6.42171684162928e-05, + "loss": 1.7872, + "step": 1272 + }, + { + "epoch": 0.3074136681960879, + "grad_norm": 0.3159698247909546, + "learning_rate": 6.41925146268032e-05, + "loss": 1.8521, + "step": 1273 + }, + { + "epoch": 0.3076551557594784, + "grad_norm": 0.3374696969985962, + "learning_rate": 6.416784633769261e-05, 
+ "loss": 1.7629, + "step": 1274 + }, + { + "epoch": 0.30789664332286887, + "grad_norm": 0.3099079430103302, + "learning_rate": 6.414316356374578e-05, + "loss": 1.6937, + "step": 1275 + }, + { + "epoch": 0.30813813088625935, + "grad_norm": 0.30903059244155884, + "learning_rate": 6.411846631975618e-05, + "loss": 1.855, + "step": 1276 + }, + { + "epoch": 0.30837961844964984, + "grad_norm": 0.30359727144241333, + "learning_rate": 6.409375462052594e-05, + "loss": 1.7491, + "step": 1277 + }, + { + "epoch": 0.3086211060130403, + "grad_norm": 0.30317601561546326, + "learning_rate": 6.406902848086582e-05, + "loss": 1.6889, + "step": 1278 + }, + { + "epoch": 0.3088625935764308, + "grad_norm": 0.2974912226200104, + "learning_rate": 6.40442879155953e-05, + "loss": 1.6461, + "step": 1279 + }, + { + "epoch": 0.3091040811398213, + "grad_norm": 0.3059878647327423, + "learning_rate": 6.401953293954246e-05, + "loss": 1.5723, + "step": 1280 + }, + { + "epoch": 0.3093455687032118, + "grad_norm": 0.31517407298088074, + "learning_rate": 6.399476356754403e-05, + "loss": 1.6743, + "step": 1281 + }, + { + "epoch": 0.30958705626660227, + "grad_norm": 0.3091956377029419, + "learning_rate": 6.396997981444537e-05, + "loss": 1.7329, + "step": 1282 + }, + { + "epoch": 0.30982854382999275, + "grad_norm": 0.3104307949542999, + "learning_rate": 6.394518169510044e-05, + "loss": 1.7746, + "step": 1283 + }, + { + "epoch": 0.31007003139338324, + "grad_norm": 0.3237158954143524, + "learning_rate": 6.392036922437185e-05, + "loss": 1.6943, + "step": 1284 + }, + { + "epoch": 0.3103115189567737, + "grad_norm": 0.33262524008750916, + "learning_rate": 6.389554241713077e-05, + "loss": 1.81, + "step": 1285 + }, + { + "epoch": 0.3105530065201642, + "grad_norm": 0.3406033515930176, + "learning_rate": 6.387070128825698e-05, + "loss": 1.6864, + "step": 1286 + }, + { + "epoch": 0.3107944940835547, + "grad_norm": 0.3164110481739044, + "learning_rate": 6.384584585263885e-05, + "loss": 1.727, + "step": 1287 + }, + { + "epoch": 0.3110359816469452, + "grad_norm": 0.32497438788414, + "learning_rate": 6.382097612517333e-05, + "loss": 1.7212, + "step": 1288 + }, + { + "epoch": 0.31127746921033567, + "grad_norm": 0.3328503668308258, + "learning_rate": 6.37960921207659e-05, + "loss": 1.6775, + "step": 1289 + }, + { + "epoch": 0.31151895677372615, + "grad_norm": 0.32597246766090393, + "learning_rate": 6.377119385433063e-05, + "loss": 1.6844, + "step": 1290 + }, + { + "epoch": 0.31176044433711664, + "grad_norm": 0.31859007477760315, + "learning_rate": 6.374628134079012e-05, + "loss": 1.6906, + "step": 1291 + }, + { + "epoch": 0.3120019319005071, + "grad_norm": 0.31793224811553955, + "learning_rate": 6.372135459507556e-05, + "loss": 1.6995, + "step": 1292 + }, + { + "epoch": 0.3122434194638976, + "grad_norm": 0.3268589973449707, + "learning_rate": 6.369641363212656e-05, + "loss": 1.6803, + "step": 1293 + }, + { + "epoch": 0.3124849070272881, + "grad_norm": 0.31107431650161743, + "learning_rate": 6.367145846689138e-05, + "loss": 1.6134, + "step": 1294 + }, + { + "epoch": 0.3127263945906786, + "grad_norm": 0.3370269536972046, + "learning_rate": 6.36464891143267e-05, + "loss": 1.8846, + "step": 1295 + }, + { + "epoch": 0.31296788215406907, + "grad_norm": 0.3170960247516632, + "learning_rate": 6.362150558939772e-05, + "loss": 1.7981, + "step": 1296 + }, + { + "epoch": 0.31320936971745955, + "grad_norm": 0.33306363224983215, + "learning_rate": 6.359650790707818e-05, + "loss": 1.8478, + "step": 1297 + }, + { + "epoch": 0.31345085728085004, + "grad_norm": 
0.3303205072879791, + "learning_rate": 6.357149608235025e-05, + "loss": 1.8357, + "step": 1298 + }, + { + "epoch": 0.3136923448442405, + "grad_norm": 0.3151163160800934, + "learning_rate": 6.354647013020461e-05, + "loss": 1.4975, + "step": 1299 + }, + { + "epoch": 0.313933832407631, + "grad_norm": 0.3328797221183777, + "learning_rate": 6.35214300656404e-05, + "loss": 1.7985, + "step": 1300 + }, + { + "epoch": 0.3141753199710215, + "grad_norm": 0.3165968656539917, + "learning_rate": 6.34963759036652e-05, + "loss": 1.8291, + "step": 1301 + }, + { + "epoch": 0.314416807534412, + "grad_norm": 0.3316687047481537, + "learning_rate": 6.347130765929507e-05, + "loss": 1.8008, + "step": 1302 + }, + { + "epoch": 0.31465829509780247, + "grad_norm": 0.3069915175437927, + "learning_rate": 6.344622534755449e-05, + "loss": 1.6981, + "step": 1303 + }, + { + "epoch": 0.31489978266119295, + "grad_norm": 0.33111026883125305, + "learning_rate": 6.342112898347635e-05, + "loss": 1.8564, + "step": 1304 + }, + { + "epoch": 0.31514127022458344, + "grad_norm": 0.31829777359962463, + "learning_rate": 6.339601858210202e-05, + "loss": 1.8491, + "step": 1305 + }, + { + "epoch": 0.3153827577879739, + "grad_norm": 0.29953789710998535, + "learning_rate": 6.337089415848124e-05, + "loss": 1.686, + "step": 1306 + }, + { + "epoch": 0.3156242453513644, + "grad_norm": 0.32130008935928345, + "learning_rate": 6.334575572767214e-05, + "loss": 1.7548, + "step": 1307 + }, + { + "epoch": 0.3158657329147549, + "grad_norm": 0.31819820404052734, + "learning_rate": 6.332060330474131e-05, + "loss": 1.6416, + "step": 1308 + }, + { + "epoch": 0.3161072204781454, + "grad_norm": 0.3157431185245514, + "learning_rate": 6.329543690476368e-05, + "loss": 1.6687, + "step": 1309 + }, + { + "epoch": 0.31634870804153586, + "grad_norm": 0.32807284593582153, + "learning_rate": 6.327025654282253e-05, + "loss": 1.7713, + "step": 1310 + }, + { + "epoch": 0.31659019560492635, + "grad_norm": 0.32748138904571533, + "learning_rate": 6.324506223400957e-05, + "loss": 1.7929, + "step": 1311 + }, + { + "epoch": 0.31683168316831684, + "grad_norm": 0.3302832841873169, + "learning_rate": 6.321985399342481e-05, + "loss": 1.8137, + "step": 1312 + }, + { + "epoch": 0.3170731707317073, + "grad_norm": 0.30940601229667664, + "learning_rate": 6.319463183617669e-05, + "loss": 1.6142, + "step": 1313 + }, + { + "epoch": 0.3173146582950978, + "grad_norm": 0.30194124579429626, + "learning_rate": 6.316939577738189e-05, + "loss": 1.7115, + "step": 1314 + }, + { + "epoch": 0.3175561458584883, + "grad_norm": 0.3257052004337311, + "learning_rate": 6.314414583216548e-05, + "loss": 1.7879, + "step": 1315 + }, + { + "epoch": 0.3177976334218788, + "grad_norm": 0.324390709400177, + "learning_rate": 6.311888201566088e-05, + "loss": 1.9028, + "step": 1316 + }, + { + "epoch": 0.31803912098526926, + "grad_norm": 0.3202170431613922, + "learning_rate": 6.309360434300975e-05, + "loss": 1.7865, + "step": 1317 + }, + { + "epoch": 0.31828060854865975, + "grad_norm": 0.3127957284450531, + "learning_rate": 6.306831282936212e-05, + "loss": 1.7583, + "step": 1318 + }, + { + "epoch": 0.31852209611205023, + "grad_norm": 0.3177083730697632, + "learning_rate": 6.304300748987627e-05, + "loss": 1.7228, + "step": 1319 + }, + { + "epoch": 0.3187635836754407, + "grad_norm": 0.33234596252441406, + "learning_rate": 6.30176883397188e-05, + "loss": 1.8422, + "step": 1320 + }, + { + "epoch": 0.3190050712388312, + "grad_norm": 0.3085367679595947, + "learning_rate": 6.299235539406456e-05, + "loss": 1.7398, + "step": 
1321 + }, + { + "epoch": 0.3192465588022217, + "grad_norm": 0.32077756524086, + "learning_rate": 6.296700866809667e-05, + "loss": 1.6157, + "step": 1322 + }, + { + "epoch": 0.3194880463656122, + "grad_norm": 0.30447328090667725, + "learning_rate": 6.294164817700655e-05, + "loss": 1.6457, + "step": 1323 + }, + { + "epoch": 0.31972953392900266, + "grad_norm": 0.3148518204689026, + "learning_rate": 6.291627393599383e-05, + "loss": 1.8575, + "step": 1324 + }, + { + "epoch": 0.31997102149239315, + "grad_norm": 0.30383065342903137, + "learning_rate": 6.289088596026638e-05, + "loss": 1.7007, + "step": 1325 + }, + { + "epoch": 0.32021250905578363, + "grad_norm": 0.31243109703063965, + "learning_rate": 6.286548426504033e-05, + "loss": 1.7474, + "step": 1326 + }, + { + "epoch": 0.3204539966191741, + "grad_norm": 0.3217732906341553, + "learning_rate": 6.284006886553998e-05, + "loss": 1.7636, + "step": 1327 + }, + { + "epoch": 0.3206954841825646, + "grad_norm": 0.3129735589027405, + "learning_rate": 6.281463977699793e-05, + "loss": 1.7425, + "step": 1328 + }, + { + "epoch": 0.3209369717459551, + "grad_norm": 0.33060258626937866, + "learning_rate": 6.278919701465489e-05, + "loss": 1.8192, + "step": 1329 + }, + { + "epoch": 0.3211784593093456, + "grad_norm": 0.31487327814102173, + "learning_rate": 6.276374059375983e-05, + "loss": 1.783, + "step": 1330 + }, + { + "epoch": 0.32141994687273606, + "grad_norm": 0.31960222125053406, + "learning_rate": 6.273827052956986e-05, + "loss": 1.7372, + "step": 1331 + }, + { + "epoch": 0.32166143443612655, + "grad_norm": 0.30207306146621704, + "learning_rate": 6.271278683735033e-05, + "loss": 1.5479, + "step": 1332 + }, + { + "epoch": 0.32190292199951703, + "grad_norm": 0.3223114013671875, + "learning_rate": 6.26872895323747e-05, + "loss": 1.8627, + "step": 1333 + }, + { + "epoch": 0.3221444095629075, + "grad_norm": 0.3229098618030548, + "learning_rate": 6.26617786299246e-05, + "loss": 1.8045, + "step": 1334 + }, + { + "epoch": 0.322385897126298, + "grad_norm": 0.2999171316623688, + "learning_rate": 6.263625414528983e-05, + "loss": 1.6994, + "step": 1335 + }, + { + "epoch": 0.3226273846896885, + "grad_norm": 0.30813145637512207, + "learning_rate": 6.261071609376832e-05, + "loss": 1.7092, + "step": 1336 + }, + { + "epoch": 0.322868872253079, + "grad_norm": 0.31752750277519226, + "learning_rate": 6.258516449066612e-05, + "loss": 1.8286, + "step": 1337 + }, + { + "epoch": 0.32311035981646946, + "grad_norm": 0.3602254390716553, + "learning_rate": 6.255959935129742e-05, + "loss": 1.9201, + "step": 1338 + }, + { + "epoch": 0.32335184737985995, + "grad_norm": 0.3091624975204468, + "learning_rate": 6.253402069098451e-05, + "loss": 1.7353, + "step": 1339 + }, + { + "epoch": 0.32359333494325043, + "grad_norm": 0.31246787309646606, + "learning_rate": 6.250842852505778e-05, + "loss": 1.7084, + "step": 1340 + }, + { + "epoch": 0.3238348225066409, + "grad_norm": 0.32028380036354065, + "learning_rate": 6.248282286885574e-05, + "loss": 1.7276, + "step": 1341 + }, + { + "epoch": 0.3240763100700314, + "grad_norm": 0.32509827613830566, + "learning_rate": 6.245720373772496e-05, + "loss": 1.8808, + "step": 1342 + }, + { + "epoch": 0.3243177976334219, + "grad_norm": 0.34290429949760437, + "learning_rate": 6.243157114702009e-05, + "loss": 1.9521, + "step": 1343 + }, + { + "epoch": 0.3245592851968124, + "grad_norm": 0.3149389326572418, + "learning_rate": 6.240592511210385e-05, + "loss": 1.8657, + "step": 1344 + }, + { + "epoch": 0.32480077276020286, + "grad_norm": 0.3112652599811554, + 
"learning_rate": 6.238026564834702e-05, + "loss": 1.6536, + "step": 1345 + }, + { + "epoch": 0.32504226032359335, + "grad_norm": 0.30453184247016907, + "learning_rate": 6.235459277112844e-05, + "loss": 1.574, + "step": 1346 + }, + { + "epoch": 0.32528374788698383, + "grad_norm": 0.3088798522949219, + "learning_rate": 6.232890649583496e-05, + "loss": 1.6068, + "step": 1347 + }, + { + "epoch": 0.3255252354503743, + "grad_norm": 0.3275122046470642, + "learning_rate": 6.230320683786148e-05, + "loss": 1.9809, + "step": 1348 + }, + { + "epoch": 0.3257667230137648, + "grad_norm": 0.31893762946128845, + "learning_rate": 6.227749381261092e-05, + "loss": 1.7996, + "step": 1349 + }, + { + "epoch": 0.3260082105771553, + "grad_norm": 0.3026633858680725, + "learning_rate": 6.22517674354942e-05, + "loss": 1.5886, + "step": 1350 + }, + { + "epoch": 0.3262496981405458, + "grad_norm": 0.31581586599349976, + "learning_rate": 6.222602772193028e-05, + "loss": 1.7078, + "step": 1351 + }, + { + "epoch": 0.32649118570393626, + "grad_norm": 0.30987709760665894, + "learning_rate": 6.220027468734605e-05, + "loss": 1.7288, + "step": 1352 + }, + { + "epoch": 0.32673267326732675, + "grad_norm": 0.3298446834087372, + "learning_rate": 6.217450834717644e-05, + "loss": 1.8196, + "step": 1353 + }, + { + "epoch": 0.32697416083071723, + "grad_norm": 0.30796289443969727, + "learning_rate": 6.214872871686433e-05, + "loss": 1.7249, + "step": 1354 + }, + { + "epoch": 0.3272156483941077, + "grad_norm": 0.30453404784202576, + "learning_rate": 6.212293581186055e-05, + "loss": 1.6672, + "step": 1355 + }, + { + "epoch": 0.3274571359574982, + "grad_norm": 0.3229547142982483, + "learning_rate": 6.209712964762393e-05, + "loss": 1.8192, + "step": 1356 + }, + { + "epoch": 0.3276986235208887, + "grad_norm": 0.33560508489608765, + "learning_rate": 6.20713102396212e-05, + "loss": 1.7354, + "step": 1357 + }, + { + "epoch": 0.3279401110842792, + "grad_norm": 0.3196660280227661, + "learning_rate": 6.204547760332705e-05, + "loss": 1.6965, + "step": 1358 + }, + { + "epoch": 0.32818159864766966, + "grad_norm": 0.3183029592037201, + "learning_rate": 6.201963175422412e-05, + "loss": 1.785, + "step": 1359 + }, + { + "epoch": 0.32842308621106014, + "grad_norm": 0.3159939646720886, + "learning_rate": 6.199377270780291e-05, + "loss": 1.6972, + "step": 1360 + }, + { + "epoch": 0.32866457377445063, + "grad_norm": 0.33196038007736206, + "learning_rate": 6.19679004795619e-05, + "loss": 1.7643, + "step": 1361 + }, + { + "epoch": 0.3289060613378411, + "grad_norm": 0.32369866967201233, + "learning_rate": 6.194201508500742e-05, + "loss": 1.8385, + "step": 1362 + }, + { + "epoch": 0.3291475489012316, + "grad_norm": 0.31358596682548523, + "learning_rate": 6.191611653965371e-05, + "loss": 1.826, + "step": 1363 + }, + { + "epoch": 0.3293890364646221, + "grad_norm": 0.3112541735172272, + "learning_rate": 6.189020485902287e-05, + "loss": 1.7407, + "step": 1364 + }, + { + "epoch": 0.3296305240280126, + "grad_norm": 0.3198995590209961, + "learning_rate": 6.186428005864492e-05, + "loss": 1.5329, + "step": 1365 + }, + { + "epoch": 0.32987201159140306, + "grad_norm": 0.3354710340499878, + "learning_rate": 6.183834215405772e-05, + "loss": 1.7694, + "step": 1366 + }, + { + "epoch": 0.33011349915479354, + "grad_norm": 0.3100753426551819, + "learning_rate": 6.181239116080693e-05, + "loss": 1.7828, + "step": 1367 + }, + { + "epoch": 0.33035498671818403, + "grad_norm": 0.3105238974094391, + "learning_rate": 6.178642709444616e-05, + "loss": 1.8108, + "step": 1368 + }, + { + 
"epoch": 0.3305964742815745, + "grad_norm": 0.3389638066291809, + "learning_rate": 6.176044997053677e-05, + "loss": 1.9256, + "step": 1369 + }, + { + "epoch": 0.330837961844965, + "grad_norm": 0.30497318506240845, + "learning_rate": 6.173445980464799e-05, + "loss": 1.6612, + "step": 1370 + }, + { + "epoch": 0.3310794494083555, + "grad_norm": 0.3220541179180145, + "learning_rate": 6.170845661235681e-05, + "loss": 1.6502, + "step": 1371 + }, + { + "epoch": 0.33132093697174597, + "grad_norm": 0.3109511137008667, + "learning_rate": 6.168244040924813e-05, + "loss": 1.7243, + "step": 1372 + }, + { + "epoch": 0.33156242453513646, + "grad_norm": 0.31736430525779724, + "learning_rate": 6.165641121091454e-05, + "loss": 1.8114, + "step": 1373 + }, + { + "epoch": 0.33180391209852694, + "grad_norm": 0.3181619644165039, + "learning_rate": 6.163036903295649e-05, + "loss": 1.714, + "step": 1374 + }, + { + "epoch": 0.33204539966191743, + "grad_norm": 0.33182013034820557, + "learning_rate": 6.160431389098216e-05, + "loss": 1.809, + "step": 1375 + }, + { + "epoch": 0.3322868872253079, + "grad_norm": 0.33612844347953796, + "learning_rate": 6.157824580060756e-05, + "loss": 1.7409, + "step": 1376 + }, + { + "epoch": 0.3325283747886984, + "grad_norm": 0.33391138911247253, + "learning_rate": 6.155216477745638e-05, + "loss": 1.7668, + "step": 1377 + }, + { + "epoch": 0.3327698623520889, + "grad_norm": 0.3073732852935791, + "learning_rate": 6.152607083716015e-05, + "loss": 1.7319, + "step": 1378 + }, + { + "epoch": 0.33301134991547937, + "grad_norm": 0.31348085403442383, + "learning_rate": 6.149996399535806e-05, + "loss": 1.6169, + "step": 1379 + }, + { + "epoch": 0.33325283747886986, + "grad_norm": 0.32224661111831665, + "learning_rate": 6.147384426769711e-05, + "loss": 1.7747, + "step": 1380 + }, + { + "epoch": 0.33349432504226034, + "grad_norm": 0.3120202124118805, + "learning_rate": 6.144771166983195e-05, + "loss": 1.87, + "step": 1381 + }, + { + "epoch": 0.33373581260565083, + "grad_norm": 0.3108193278312683, + "learning_rate": 6.142156621742496e-05, + "loss": 1.7512, + "step": 1382 + }, + { + "epoch": 0.3339773001690413, + "grad_norm": 0.31070417165756226, + "learning_rate": 6.13954079261463e-05, + "loss": 1.8596, + "step": 1383 + }, + { + "epoch": 0.3342187877324318, + "grad_norm": 0.3115104138851166, + "learning_rate": 6.136923681167372e-05, + "loss": 1.6334, + "step": 1384 + }, + { + "epoch": 0.3344602752958223, + "grad_norm": 0.3107805550098419, + "learning_rate": 6.134305288969273e-05, + "loss": 1.7409, + "step": 1385 + }, + { + "epoch": 0.33470176285921277, + "grad_norm": 0.32333892583847046, + "learning_rate": 6.131685617589646e-05, + "loss": 1.831, + "step": 1386 + }, + { + "epoch": 0.33494325042260326, + "grad_norm": 0.3145526349544525, + "learning_rate": 6.129064668598574e-05, + "loss": 1.8139, + "step": 1387 + }, + { + "epoch": 0.33518473798599374, + "grad_norm": 0.3273543119430542, + "learning_rate": 6.12644244356691e-05, + "loss": 1.7674, + "step": 1388 + }, + { + "epoch": 0.3354262255493842, + "grad_norm": 0.31777769327163696, + "learning_rate": 6.123818944066259e-05, + "loss": 1.7356, + "step": 1389 + }, + { + "epoch": 0.3356677131127747, + "grad_norm": 0.30964168906211853, + "learning_rate": 6.121194171669003e-05, + "loss": 1.749, + "step": 1390 + }, + { + "epoch": 0.3359092006761652, + "grad_norm": 0.3542748689651489, + "learning_rate": 6.11856812794828e-05, + "loss": 1.856, + "step": 1391 + }, + { + "epoch": 0.3361506882395557, + "grad_norm": 0.32668453454971313, + "learning_rate": 
6.115940814477994e-05, + "loss": 1.877, + "step": 1392 + }, + { + "epoch": 0.33639217580294617, + "grad_norm": 0.34220948815345764, + "learning_rate": 6.113312232832804e-05, + "loss": 1.688, + "step": 1393 + }, + { + "epoch": 0.33663366336633666, + "grad_norm": 0.3136855661869049, + "learning_rate": 6.110682384588133e-05, + "loss": 1.6078, + "step": 1394 + }, + { + "epoch": 0.33687515092972714, + "grad_norm": 0.337715208530426, + "learning_rate": 6.108051271320167e-05, + "loss": 1.8654, + "step": 1395 + }, + { + "epoch": 0.3371166384931176, + "grad_norm": 0.30137553811073303, + "learning_rate": 6.105418894605841e-05, + "loss": 1.5995, + "step": 1396 + }, + { + "epoch": 0.3373581260565081, + "grad_norm": 0.32862013578414917, + "learning_rate": 6.1027852560228555e-05, + "loss": 1.8154, + "step": 1397 + }, + { + "epoch": 0.3375996136198986, + "grad_norm": 0.3268672227859497, + "learning_rate": 6.1001503571496636e-05, + "loss": 1.7151, + "step": 1398 + }, + { + "epoch": 0.3378411011832891, + "grad_norm": 0.3278553783893585, + "learning_rate": 6.097514199565473e-05, + "loss": 1.771, + "step": 1399 + }, + { + "epoch": 0.33808258874667957, + "grad_norm": 0.3203633725643158, + "learning_rate": 6.0948767848502486e-05, + "loss": 1.6725, + "step": 1400 + }, + { + "epoch": 0.33832407631007005, + "grad_norm": 0.33434566855430603, + "learning_rate": 6.0922381145847065e-05, + "loss": 1.7686, + "step": 1401 + }, + { + "epoch": 0.33856556387346054, + "grad_norm": 0.3028900921344757, + "learning_rate": 6.089598190350316e-05, + "loss": 1.6449, + "step": 1402 + }, + { + "epoch": 0.338807051436851, + "grad_norm": 0.32168394327163696, + "learning_rate": 6.086957013729297e-05, + "loss": 1.76, + "step": 1403 + }, + { + "epoch": 0.3390485390002415, + "grad_norm": 0.3260248601436615, + "learning_rate": 6.084314586304624e-05, + "loss": 1.6925, + "step": 1404 + }, + { + "epoch": 0.339290026563632, + "grad_norm": 0.3169650137424469, + "learning_rate": 6.081670909660014e-05, + "loss": 1.7216, + "step": 1405 + }, + { + "epoch": 0.3395315141270225, + "grad_norm": 0.3010064661502838, + "learning_rate": 6.0790259853799386e-05, + "loss": 1.5303, + "step": 1406 + }, + { + "epoch": 0.33977300169041297, + "grad_norm": 0.32520854473114014, + "learning_rate": 6.076379815049617e-05, + "loss": 1.785, + "step": 1407 + }, + { + "epoch": 0.34001448925380345, + "grad_norm": 0.32523801922798157, + "learning_rate": 6.0737324002550095e-05, + "loss": 1.6572, + "step": 1408 + }, + { + "epoch": 0.34025597681719394, + "grad_norm": 0.3176769018173218, + "learning_rate": 6.0710837425828314e-05, + "loss": 1.5568, + "step": 1409 + }, + { + "epoch": 0.3404974643805844, + "grad_norm": 0.3224984407424927, + "learning_rate": 6.068433843620535e-05, + "loss": 1.6022, + "step": 1410 + }, + { + "epoch": 0.3407389519439749, + "grad_norm": 0.3200245797634125, + "learning_rate": 6.065782704956319e-05, + "loss": 1.7426, + "step": 1411 + }, + { + "epoch": 0.3409804395073654, + "grad_norm": 0.3169932961463928, + "learning_rate": 6.063130328179128e-05, + "loss": 1.6143, + "step": 1412 + }, + { + "epoch": 0.3412219270707559, + "grad_norm": 0.31651175022125244, + "learning_rate": 6.0604767148786436e-05, + "loss": 1.6513, + "step": 1413 + }, + { + "epoch": 0.34146341463414637, + "grad_norm": 0.3085106313228607, + "learning_rate": 6.0578218666452914e-05, + "loss": 1.759, + "step": 1414 + }, + { + "epoch": 0.34170490219753685, + "grad_norm": 0.328730046749115, + "learning_rate": 6.055165785070239e-05, + "loss": 1.9085, + "step": 1415 + }, + { + "epoch": 
0.34194638976092734, + "grad_norm": 0.30749958753585815, + "learning_rate": 6.052508471745389e-05, + "loss": 1.644, + "step": 1416 + }, + { + "epoch": 0.3421878773243178, + "grad_norm": 0.3132942020893097, + "learning_rate": 6.049849928263385e-05, + "loss": 1.7456, + "step": 1417 + }, + { + "epoch": 0.3424293648877083, + "grad_norm": 0.3153761327266693, + "learning_rate": 6.047190156217607e-05, + "loss": 1.8136, + "step": 1418 + }, + { + "epoch": 0.3426708524510988, + "grad_norm": 0.2964738607406616, + "learning_rate": 6.0445291572021716e-05, + "loss": 1.657, + "step": 1419 + }, + { + "epoch": 0.3429123400144893, + "grad_norm": 0.3104841709136963, + "learning_rate": 6.04186693281193e-05, + "loss": 1.7264, + "step": 1420 + }, + { + "epoch": 0.34315382757787977, + "grad_norm": 0.35105088353157043, + "learning_rate": 6.0392034846424696e-05, + "loss": 1.8898, + "step": 1421 + }, + { + "epoch": 0.34339531514127025, + "grad_norm": 0.33985963463783264, + "learning_rate": 6.0365388142901096e-05, + "loss": 1.8255, + "step": 1422 + }, + { + "epoch": 0.34363680270466074, + "grad_norm": 0.3295535743236542, + "learning_rate": 6.0338729233519026e-05, + "loss": 1.6857, + "step": 1423 + }, + { + "epoch": 0.34387829026805117, + "grad_norm": 0.31867682933807373, + "learning_rate": 6.0312058134256314e-05, + "loss": 1.8694, + "step": 1424 + }, + { + "epoch": 0.34411977783144165, + "grad_norm": 0.3171629011631012, + "learning_rate": 6.0285374861098125e-05, + "loss": 1.7238, + "step": 1425 + }, + { + "epoch": 0.34436126539483214, + "grad_norm": 0.3434184193611145, + "learning_rate": 6.025867943003687e-05, + "loss": 1.6924, + "step": 1426 + }, + { + "epoch": 0.3446027529582226, + "grad_norm": 0.3540340065956116, + "learning_rate": 6.02319718570723e-05, + "loss": 1.8509, + "step": 1427 + }, + { + "epoch": 0.3448442405216131, + "grad_norm": 0.3207017779350281, + "learning_rate": 6.020525215821142e-05, + "loss": 1.7741, + "step": 1428 + }, + { + "epoch": 0.3450857280850036, + "grad_norm": 0.31496745347976685, + "learning_rate": 6.0178520349468475e-05, + "loss": 1.7462, + "step": 1429 + }, + { + "epoch": 0.3453272156483941, + "grad_norm": 0.3210442066192627, + "learning_rate": 6.0151776446865015e-05, + "loss": 1.7108, + "step": 1430 + }, + { + "epoch": 0.34556870321178457, + "grad_norm": 0.3334159255027771, + "learning_rate": 6.012502046642982e-05, + "loss": 1.6989, + "step": 1431 + }, + { + "epoch": 0.34581019077517505, + "grad_norm": 0.32715243101119995, + "learning_rate": 6.00982524241989e-05, + "loss": 1.9606, + "step": 1432 + }, + { + "epoch": 0.34605167833856554, + "grad_norm": 0.3288145065307617, + "learning_rate": 6.007147233621551e-05, + "loss": 1.9522, + "step": 1433 + }, + { + "epoch": 0.346293165901956, + "grad_norm": 0.30984047055244446, + "learning_rate": 6.004468021853011e-05, + "loss": 1.7703, + "step": 1434 + }, + { + "epoch": 0.3465346534653465, + "grad_norm": 0.323690265417099, + "learning_rate": 6.001787608720037e-05, + "loss": 1.7608, + "step": 1435 + }, + { + "epoch": 0.346776141028737, + "grad_norm": 0.33015599846839905, + "learning_rate": 5.9991059958291176e-05, + "loss": 1.8368, + "step": 1436 + }, + { + "epoch": 0.3470176285921275, + "grad_norm": 0.3160457909107208, + "learning_rate": 5.9964231847874596e-05, + "loss": 1.8098, + "step": 1437 + }, + { + "epoch": 0.34725911615551797, + "grad_norm": 0.30281051993370056, + "learning_rate": 5.9937391772029855e-05, + "loss": 1.7887, + "step": 1438 + }, + { + "epoch": 0.34750060371890845, + "grad_norm": 0.315327525138855, + "learning_rate": 
5.9910539746843405e-05, + "loss": 1.7365, + "step": 1439 + }, + { + "epoch": 0.34774209128229894, + "grad_norm": 0.3132166564464569, + "learning_rate": 5.988367578840881e-05, + "loss": 1.6718, + "step": 1440 + }, + { + "epoch": 0.3479835788456894, + "grad_norm": 0.32553204894065857, + "learning_rate": 5.985679991282679e-05, + "loss": 1.8002, + "step": 1441 + }, + { + "epoch": 0.3482250664090799, + "grad_norm": 0.3237243592739105, + "learning_rate": 5.9829912136205236e-05, + "loss": 1.8928, + "step": 1442 + }, + { + "epoch": 0.3484665539724704, + "grad_norm": 0.32126304507255554, + "learning_rate": 5.980301247465917e-05, + "loss": 1.6859, + "step": 1443 + }, + { + "epoch": 0.3487080415358609, + "grad_norm": 0.3168717920780182, + "learning_rate": 5.977610094431068e-05, + "loss": 1.8302, + "step": 1444 + }, + { + "epoch": 0.34894952909925137, + "grad_norm": 0.3163128197193146, + "learning_rate": 5.9749177561289063e-05, + "loss": 1.6948, + "step": 1445 + }, + { + "epoch": 0.34919101666264185, + "grad_norm": 0.3239203989505768, + "learning_rate": 5.9722242341730635e-05, + "loss": 1.7526, + "step": 1446 + }, + { + "epoch": 0.34943250422603234, + "grad_norm": 0.30871322751045227, + "learning_rate": 5.969529530177884e-05, + "loss": 1.575, + "step": 1447 + }, + { + "epoch": 0.3496739917894228, + "grad_norm": 0.3129870295524597, + "learning_rate": 5.966833645758422e-05, + "loss": 1.8075, + "step": 1448 + }, + { + "epoch": 0.3499154793528133, + "grad_norm": 0.3211073875427246, + "learning_rate": 5.9641365825304355e-05, + "loss": 1.763, + "step": 1449 + }, + { + "epoch": 0.3501569669162038, + "grad_norm": 0.32273295521736145, + "learning_rate": 5.9614383421103944e-05, + "loss": 1.8933, + "step": 1450 + }, + { + "epoch": 0.3503984544795943, + "grad_norm": 0.31030890345573425, + "learning_rate": 5.9587389261154686e-05, + "loss": 1.6552, + "step": 1451 + }, + { + "epoch": 0.35063994204298476, + "grad_norm": 0.31312838196754456, + "learning_rate": 5.956038336163534e-05, + "loss": 1.6923, + "step": 1452 + }, + { + "epoch": 0.35088142960637525, + "grad_norm": 0.3211262822151184, + "learning_rate": 5.9533365738731734e-05, + "loss": 1.7661, + "step": 1453 + }, + { + "epoch": 0.35112291716976574, + "grad_norm": 0.3056935966014862, + "learning_rate": 5.95063364086367e-05, + "loss": 1.6947, + "step": 1454 + }, + { + "epoch": 0.3513644047331562, + "grad_norm": 0.3259216547012329, + "learning_rate": 5.947929538755006e-05, + "loss": 1.836, + "step": 1455 + }, + { + "epoch": 0.3516058922965467, + "grad_norm": 0.3077600300312042, + "learning_rate": 5.94522426916787e-05, + "loss": 1.7187, + "step": 1456 + }, + { + "epoch": 0.3518473798599372, + "grad_norm": 0.3284499943256378, + "learning_rate": 5.942517833723644e-05, + "loss": 1.8225, + "step": 1457 + }, + { + "epoch": 0.3520888674233277, + "grad_norm": 0.31834086775779724, + "learning_rate": 5.939810234044413e-05, + "loss": 1.7048, + "step": 1458 + }, + { + "epoch": 0.35233035498671816, + "grad_norm": 0.3011278510093689, + "learning_rate": 5.937101471752961e-05, + "loss": 1.6252, + "step": 1459 + }, + { + "epoch": 0.35257184255010865, + "grad_norm": 0.31385111808776855, + "learning_rate": 5.934391548472763e-05, + "loss": 1.6818, + "step": 1460 + }, + { + "epoch": 0.35281333011349914, + "grad_norm": 0.32963138818740845, + "learning_rate": 5.931680465827995e-05, + "loss": 1.939, + "step": 1461 + }, + { + "epoch": 0.3530548176768896, + "grad_norm": 0.3022247552871704, + "learning_rate": 5.928968225443526e-05, + "loss": 1.7759, + "step": 1462 + }, + { + "epoch": 
0.3532963052402801, + "grad_norm": 0.30561262369155884, + "learning_rate": 5.9262548289449185e-05, + "loss": 1.6501, + "step": 1463 + }, + { + "epoch": 0.3535377928036706, + "grad_norm": 0.3121855556964874, + "learning_rate": 5.9235402779584294e-05, + "loss": 1.7566, + "step": 1464 + }, + { + "epoch": 0.3537792803670611, + "grad_norm": 0.32116931676864624, + "learning_rate": 5.920824574111006e-05, + "loss": 1.726, + "step": 1465 + }, + { + "epoch": 0.35402076793045156, + "grad_norm": 0.29525837302207947, + "learning_rate": 5.918107719030287e-05, + "loss": 1.6163, + "step": 1466 + }, + { + "epoch": 0.35426225549384205, + "grad_norm": 0.3194003999233246, + "learning_rate": 5.9153897143446014e-05, + "loss": 1.7976, + "step": 1467 + }, + { + "epoch": 0.35450374305723253, + "grad_norm": 0.31026211380958557, + "learning_rate": 5.912670561682968e-05, + "loss": 1.7198, + "step": 1468 + }, + { + "epoch": 0.354745230620623, + "grad_norm": 0.31474968791007996, + "learning_rate": 5.9099502626750914e-05, + "loss": 1.6546, + "step": 1469 + }, + { + "epoch": 0.3549867181840135, + "grad_norm": 0.3168904185295105, + "learning_rate": 5.907228818951364e-05, + "loss": 1.7855, + "step": 1470 + }, + { + "epoch": 0.355228205747404, + "grad_norm": 0.33451682329177856, + "learning_rate": 5.9045062321428665e-05, + "loss": 1.7105, + "step": 1471 + }, + { + "epoch": 0.3554696933107945, + "grad_norm": 0.3296138048171997, + "learning_rate": 5.901782503881363e-05, + "loss": 1.847, + "step": 1472 + }, + { + "epoch": 0.35571118087418496, + "grad_norm": 0.29878273606300354, + "learning_rate": 5.899057635799299e-05, + "loss": 1.6533, + "step": 1473 + }, + { + "epoch": 0.35595266843757545, + "grad_norm": 0.33155831694602966, + "learning_rate": 5.896331629529809e-05, + "loss": 1.9612, + "step": 1474 + }, + { + "epoch": 0.35619415600096593, + "grad_norm": 0.3336942493915558, + "learning_rate": 5.893604486706705e-05, + "loss": 1.8694, + "step": 1475 + }, + { + "epoch": 0.3564356435643564, + "grad_norm": 0.32858115434646606, + "learning_rate": 5.890876208964482e-05, + "loss": 1.8195, + "step": 1476 + }, + { + "epoch": 0.3566771311277469, + "grad_norm": 0.3218596577644348, + "learning_rate": 5.888146797938316e-05, + "loss": 1.8801, + "step": 1477 + }, + { + "epoch": 0.3569186186911374, + "grad_norm": 0.31268176436424255, + "learning_rate": 5.885416255264059e-05, + "loss": 1.6911, + "step": 1478 + }, + { + "epoch": 0.3571601062545279, + "grad_norm": 0.32213079929351807, + "learning_rate": 5.882684582578244e-05, + "loss": 1.8121, + "step": 1479 + }, + { + "epoch": 0.35740159381791836, + "grad_norm": 0.32161325216293335, + "learning_rate": 5.879951781518083e-05, + "loss": 1.7864, + "step": 1480 + }, + { + "epoch": 0.35764308138130885, + "grad_norm": 0.32209402322769165, + "learning_rate": 5.8772178537214586e-05, + "loss": 1.8956, + "step": 1481 + }, + { + "epoch": 0.35788456894469933, + "grad_norm": 0.3203023374080658, + "learning_rate": 5.8744828008269336e-05, + "loss": 1.7863, + "step": 1482 + }, + { + "epoch": 0.3581260565080898, + "grad_norm": 0.3110294044017792, + "learning_rate": 5.871746624473744e-05, + "loss": 1.7236, + "step": 1483 + }, + { + "epoch": 0.3583675440714803, + "grad_norm": 0.3175148665904999, + "learning_rate": 5.8690093263017984e-05, + "loss": 1.8843, + "step": 1484 + }, + { + "epoch": 0.3586090316348708, + "grad_norm": 0.31239208579063416, + "learning_rate": 5.866270907951678e-05, + "loss": 1.7412, + "step": 1485 + }, + { + "epoch": 0.3588505191982613, + "grad_norm": 0.30664995312690735, + "learning_rate": 
5.863531371064634e-05, + "loss": 1.7344, + "step": 1486 + }, + { + "epoch": 0.35909200676165176, + "grad_norm": 0.3216778635978699, + "learning_rate": 5.8607907172825923e-05, + "loss": 1.8317, + "step": 1487 + }, + { + "epoch": 0.35933349432504225, + "grad_norm": 0.3176087737083435, + "learning_rate": 5.858048948248143e-05, + "loss": 1.778, + "step": 1488 + }, + { + "epoch": 0.35957498188843273, + "grad_norm": 0.31520044803619385, + "learning_rate": 5.855306065604548e-05, + "loss": 1.6223, + "step": 1489 + }, + { + "epoch": 0.3598164694518232, + "grad_norm": 0.33666151762008667, + "learning_rate": 5.852562070995735e-05, + "loss": 1.8668, + "step": 1490 + }, + { + "epoch": 0.3600579570152137, + "grad_norm": 0.3103683292865753, + "learning_rate": 5.849816966066298e-05, + "loss": 1.8146, + "step": 1491 + }, + { + "epoch": 0.3602994445786042, + "grad_norm": 0.32813334465026855, + "learning_rate": 5.8470707524615e-05, + "loss": 1.9376, + "step": 1492 + }, + { + "epoch": 0.3605409321419947, + "grad_norm": 0.33966293931007385, + "learning_rate": 5.844323431827263e-05, + "loss": 2.0089, + "step": 1493 + }, + { + "epoch": 0.36078241970538516, + "grad_norm": 0.29662173986434937, + "learning_rate": 5.8415750058101765e-05, + "loss": 1.6096, + "step": 1494 + }, + { + "epoch": 0.36102390726877565, + "grad_norm": 0.3011605441570282, + "learning_rate": 5.83882547605749e-05, + "loss": 1.6289, + "step": 1495 + }, + { + "epoch": 0.36126539483216613, + "grad_norm": 0.3044760227203369, + "learning_rate": 5.8360748442171164e-05, + "loss": 1.7737, + "step": 1496 + }, + { + "epoch": 0.3615068823955566, + "grad_norm": 0.31246650218963623, + "learning_rate": 5.833323111937629e-05, + "loss": 1.7599, + "step": 1497 + }, + { + "epoch": 0.3617483699589471, + "grad_norm": 0.30395039916038513, + "learning_rate": 5.830570280868258e-05, + "loss": 1.6438, + "step": 1498 + }, + { + "epoch": 0.3619898575223376, + "grad_norm": 0.3342861831188202, + "learning_rate": 5.827816352658896e-05, + "loss": 1.9117, + "step": 1499 + }, + { + "epoch": 0.3622313450857281, + "grad_norm": 0.3126901090145111, + "learning_rate": 5.825061328960091e-05, + "loss": 1.8322, + "step": 1500 + }, + { + "epoch": 0.36247283264911856, + "grad_norm": 0.325332909822464, + "learning_rate": 5.822305211423049e-05, + "loss": 1.8047, + "step": 1501 + }, + { + "epoch": 0.36271432021250904, + "grad_norm": 0.3215937614440918, + "learning_rate": 5.819548001699628e-05, + "loss": 1.8229, + "step": 1502 + }, + { + "epoch": 0.36295580777589953, + "grad_norm": 0.32431450486183167, + "learning_rate": 5.816789701442345e-05, + "loss": 1.7385, + "step": 1503 + }, + { + "epoch": 0.36319729533929, + "grad_norm": 0.3194507956504822, + "learning_rate": 5.8140303123043676e-05, + "loss": 1.6355, + "step": 1504 + }, + { + "epoch": 0.3634387829026805, + "grad_norm": 0.31866469979286194, + "learning_rate": 5.811269835939518e-05, + "loss": 1.7696, + "step": 1505 + }, + { + "epoch": 0.363680270466071, + "grad_norm": 0.30973389744758606, + "learning_rate": 5.808508274002269e-05, + "loss": 1.6875, + "step": 1506 + }, + { + "epoch": 0.3639217580294615, + "grad_norm": 0.31541547179222107, + "learning_rate": 5.805745628147744e-05, + "loss": 1.6931, + "step": 1507 + }, + { + "epoch": 0.36416324559285196, + "grad_norm": 0.31543099880218506, + "learning_rate": 5.802981900031716e-05, + "loss": 1.7594, + "step": 1508 + }, + { + "epoch": 0.36440473315624244, + "grad_norm": 0.3169846832752228, + "learning_rate": 5.8002170913106074e-05, + "loss": 1.8439, + "step": 1509 + }, + { + "epoch": 
0.36464622071963293, + "grad_norm": 0.31679767370224, + "learning_rate": 5.797451203641488e-05, + "loss": 1.7327, + "step": 1510 + }, + { + "epoch": 0.3648877082830234, + "grad_norm": 0.30597200989723206, + "learning_rate": 5.794684238682072e-05, + "loss": 1.68, + "step": 1511 + }, + { + "epoch": 0.3651291958464139, + "grad_norm": 0.36071524024009705, + "learning_rate": 5.7919161980907236e-05, + "loss": 1.9643, + "step": 1512 + }, + { + "epoch": 0.3653706834098044, + "grad_norm": 0.306130975484848, + "learning_rate": 5.789147083526449e-05, + "loss": 1.5648, + "step": 1513 + }, + { + "epoch": 0.36561217097319487, + "grad_norm": 0.3169862926006317, + "learning_rate": 5.7863768966488966e-05, + "loss": 1.7462, + "step": 1514 + }, + { + "epoch": 0.36585365853658536, + "grad_norm": 0.31784337759017944, + "learning_rate": 5.783605639118362e-05, + "loss": 1.749, + "step": 1515 + }, + { + "epoch": 0.36609514609997584, + "grad_norm": 0.33231326937675476, + "learning_rate": 5.780833312595777e-05, + "loss": 1.6741, + "step": 1516 + }, + { + "epoch": 0.36633663366336633, + "grad_norm": 0.3198108673095703, + "learning_rate": 5.7780599187427186e-05, + "loss": 1.8175, + "step": 1517 + }, + { + "epoch": 0.3665781212267568, + "grad_norm": 0.32270848751068115, + "learning_rate": 5.775285459221401e-05, + "loss": 1.8811, + "step": 1518 + }, + { + "epoch": 0.3668196087901473, + "grad_norm": 0.328346848487854, + "learning_rate": 5.772509935694678e-05, + "loss": 1.6793, + "step": 1519 + }, + { + "epoch": 0.3670610963535378, + "grad_norm": 0.29311639070510864, + "learning_rate": 5.7697333498260414e-05, + "loss": 1.6357, + "step": 1520 + }, + { + "epoch": 0.36730258391692827, + "grad_norm": 0.3062235414981842, + "learning_rate": 5.7669557032796184e-05, + "loss": 1.6904, + "step": 1521 + }, + { + "epoch": 0.36754407148031876, + "grad_norm": 0.3087918162345886, + "learning_rate": 5.764176997720175e-05, + "loss": 1.7203, + "step": 1522 + }, + { + "epoch": 0.36778555904370924, + "grad_norm": 0.2941713333129883, + "learning_rate": 5.761397234813106e-05, + "loss": 1.5707, + "step": 1523 + }, + { + "epoch": 0.36802704660709973, + "grad_norm": 0.3183874487876892, + "learning_rate": 5.7586164162244474e-05, + "loss": 1.7364, + "step": 1524 + }, + { + "epoch": 0.3682685341704902, + "grad_norm": 0.2950633466243744, + "learning_rate": 5.7558345436208616e-05, + "loss": 1.5168, + "step": 1525 + }, + { + "epoch": 0.3685100217338807, + "grad_norm": 0.3116483986377716, + "learning_rate": 5.753051618669646e-05, + "loss": 1.8518, + "step": 1526 + }, + { + "epoch": 0.3687515092972712, + "grad_norm": 0.3113264739513397, + "learning_rate": 5.7502676430387275e-05, + "loss": 1.7688, + "step": 1527 + }, + { + "epoch": 0.36899299686066167, + "grad_norm": 0.3159504532814026, + "learning_rate": 5.747482618396666e-05, + "loss": 1.7198, + "step": 1528 + }, + { + "epoch": 0.36923448442405216, + "grad_norm": 0.34909993410110474, + "learning_rate": 5.744696546412642e-05, + "loss": 1.8096, + "step": 1529 + }, + { + "epoch": 0.36947597198744264, + "grad_norm": 0.31155431270599365, + "learning_rate": 5.741909428756473e-05, + "loss": 1.7383, + "step": 1530 + }, + { + "epoch": 0.3697174595508331, + "grad_norm": 0.3173414468765259, + "learning_rate": 5.7391212670985985e-05, + "loss": 1.8322, + "step": 1531 + }, + { + "epoch": 0.3699589471142236, + "grad_norm": 0.3144669830799103, + "learning_rate": 5.736332063110084e-05, + "loss": 1.7064, + "step": 1532 + }, + { + "epoch": 0.3702004346776141, + "grad_norm": 0.31089121103286743, + "learning_rate": 
5.733541818462621e-05, + "loss": 1.6687, + "step": 1533 + }, + { + "epoch": 0.3704419222410046, + "grad_norm": 0.3142034411430359, + "learning_rate": 5.7307505348285216e-05, + "loss": 1.6096, + "step": 1534 + }, + { + "epoch": 0.37068340980439507, + "grad_norm": 0.3087711036205292, + "learning_rate": 5.7279582138807264e-05, + "loss": 1.6961, + "step": 1535 + }, + { + "epoch": 0.37092489736778556, + "grad_norm": 0.2999480664730072, + "learning_rate": 5.725164857292791e-05, + "loss": 1.6879, + "step": 1536 + }, + { + "epoch": 0.37116638493117604, + "grad_norm": 0.32123640179634094, + "learning_rate": 5.7223704667388965e-05, + "loss": 1.8965, + "step": 1537 + }, + { + "epoch": 0.3714078724945665, + "grad_norm": 0.29996874928474426, + "learning_rate": 5.719575043893842e-05, + "loss": 1.6886, + "step": 1538 + }, + { + "epoch": 0.371649360057957, + "grad_norm": 0.32067954540252686, + "learning_rate": 5.716778590433045e-05, + "loss": 1.7657, + "step": 1539 + }, + { + "epoch": 0.3718908476213475, + "grad_norm": 0.3116958737373352, + "learning_rate": 5.713981108032542e-05, + "loss": 1.7947, + "step": 1540 + }, + { + "epoch": 0.372132335184738, + "grad_norm": 0.30600202083587646, + "learning_rate": 5.711182598368983e-05, + "loss": 1.7915, + "step": 1541 + }, + { + "epoch": 0.37237382274812847, + "grad_norm": 0.3116418421268463, + "learning_rate": 5.7083830631196375e-05, + "loss": 1.6921, + "step": 1542 + }, + { + "epoch": 0.37261531031151895, + "grad_norm": 0.31770211458206177, + "learning_rate": 5.705582503962388e-05, + "loss": 1.6573, + "step": 1543 + }, + { + "epoch": 0.37285679787490944, + "grad_norm": 0.31877562403678894, + "learning_rate": 5.702780922575733e-05, + "loss": 1.8058, + "step": 1544 + }, + { + "epoch": 0.3730982854382999, + "grad_norm": 0.32386425137519836, + "learning_rate": 5.699978320638777e-05, + "loss": 1.8911, + "step": 1545 + }, + { + "epoch": 0.3733397730016904, + "grad_norm": 0.31875795125961304, + "learning_rate": 5.697174699831244e-05, + "loss": 1.6746, + "step": 1546 + }, + { + "epoch": 0.3735812605650809, + "grad_norm": 0.30674871802330017, + "learning_rate": 5.694370061833464e-05, + "loss": 1.6765, + "step": 1547 + }, + { + "epoch": 0.3738227481284714, + "grad_norm": 0.3357049822807312, + "learning_rate": 5.691564408326379e-05, + "loss": 1.7836, + "step": 1548 + }, + { + "epoch": 0.37406423569186187, + "grad_norm": 0.318651020526886, + "learning_rate": 5.688757740991537e-05, + "loss": 1.6588, + "step": 1549 + }, + { + "epoch": 0.37430572325525235, + "grad_norm": 0.3196345567703247, + "learning_rate": 5.6859500615110956e-05, + "loss": 1.7283, + "step": 1550 + }, + { + "epoch": 0.37454721081864284, + "grad_norm": 0.35835352540016174, + "learning_rate": 5.6831413715678197e-05, + "loss": 1.8224, + "step": 1551 + }, + { + "epoch": 0.3747886983820333, + "grad_norm": 0.30183541774749756, + "learning_rate": 5.680331672845078e-05, + "loss": 1.6194, + "step": 1552 + }, + { + "epoch": 0.3750301859454238, + "grad_norm": 0.318406343460083, + "learning_rate": 5.6775209670268436e-05, + "loss": 1.7579, + "step": 1553 + }, + { + "epoch": 0.3752716735088143, + "grad_norm": 0.3073185980319977, + "learning_rate": 5.6747092557976966e-05, + "loss": 1.6283, + "step": 1554 + }, + { + "epoch": 0.3755131610722048, + "grad_norm": 0.3035070598125458, + "learning_rate": 5.671896540842815e-05, + "loss": 1.7404, + "step": 1555 + }, + { + "epoch": 0.37575464863559527, + "grad_norm": 0.3201872706413269, + "learning_rate": 5.66908282384798e-05, + "loss": 1.8483, + "step": 1556 + }, + { + "epoch": 
0.37599613619898575, + "grad_norm": 0.3132006525993347, + "learning_rate": 5.6662681064995776e-05, + "loss": 1.696, + "step": 1557 + }, + { + "epoch": 0.37623762376237624, + "grad_norm": 0.31123900413513184, + "learning_rate": 5.663452390484586e-05, + "loss": 1.6547, + "step": 1558 + }, + { + "epoch": 0.3764791113257667, + "grad_norm": 0.3195400834083557, + "learning_rate": 5.660635677490587e-05, + "loss": 1.7253, + "step": 1559 + }, + { + "epoch": 0.3767205988891572, + "grad_norm": 0.30685698986053467, + "learning_rate": 5.657817969205759e-05, + "loss": 1.6781, + "step": 1560 + }, + { + "epoch": 0.3769620864525477, + "grad_norm": 0.3170833885669708, + "learning_rate": 5.654999267318877e-05, + "loss": 1.7026, + "step": 1561 + }, + { + "epoch": 0.3772035740159382, + "grad_norm": 0.3388971984386444, + "learning_rate": 5.652179573519309e-05, + "loss": 1.763, + "step": 1562 + }, + { + "epoch": 0.37744506157932867, + "grad_norm": 0.33410897850990295, + "learning_rate": 5.6493588894970205e-05, + "loss": 1.7048, + "step": 1563 + }, + { + "epoch": 0.37768654914271915, + "grad_norm": 0.3090329170227051, + "learning_rate": 5.646537216942571e-05, + "loss": 1.6734, + "step": 1564 + }, + { + "epoch": 0.37792803670610964, + "grad_norm": 0.3300420343875885, + "learning_rate": 5.6437145575471086e-05, + "loss": 1.8244, + "step": 1565 + }, + { + "epoch": 0.3781695242695001, + "grad_norm": 0.34012481570243835, + "learning_rate": 5.640890913002377e-05, + "loss": 1.827, + "step": 1566 + }, + { + "epoch": 0.3784110118328906, + "grad_norm": 0.31980302929878235, + "learning_rate": 5.638066285000708e-05, + "loss": 1.685, + "step": 1567 + }, + { + "epoch": 0.3786524993962811, + "grad_norm": 0.32107704877853394, + "learning_rate": 5.6352406752350225e-05, + "loss": 1.8356, + "step": 1568 + }, + { + "epoch": 0.3788939869596716, + "grad_norm": 0.3096439838409424, + "learning_rate": 5.632414085398832e-05, + "loss": 1.5647, + "step": 1569 + }, + { + "epoch": 0.37913547452306207, + "grad_norm": 0.3330332934856415, + "learning_rate": 5.6295865171862357e-05, + "loss": 1.7864, + "step": 1570 + }, + { + "epoch": 0.37937696208645255, + "grad_norm": 0.34433725476264954, + "learning_rate": 5.6267579722919126e-05, + "loss": 1.8842, + "step": 1571 + }, + { + "epoch": 0.37961844964984304, + "grad_norm": 0.3113875091075897, + "learning_rate": 5.623928452411136e-05, + "loss": 1.8177, + "step": 1572 + }, + { + "epoch": 0.3798599372132335, + "grad_norm": 0.32041990756988525, + "learning_rate": 5.621097959239759e-05, + "loss": 1.7717, + "step": 1573 + }, + { + "epoch": 0.380101424776624, + "grad_norm": 0.3251771628856659, + "learning_rate": 5.618266494474218e-05, + "loss": 1.7525, + "step": 1574 + }, + { + "epoch": 0.3803429123400145, + "grad_norm": 0.3050212860107422, + "learning_rate": 5.6154340598115316e-05, + "loss": 1.6738, + "step": 1575 + }, + { + "epoch": 0.380584399903405, + "grad_norm": 0.3157691955566406, + "learning_rate": 5.612600656949302e-05, + "loss": 1.748, + "step": 1576 + }, + { + "epoch": 0.38082588746679547, + "grad_norm": 0.340025931596756, + "learning_rate": 5.609766287585711e-05, + "loss": 1.8142, + "step": 1577 + }, + { + "epoch": 0.38106737503018595, + "grad_norm": 0.3133496046066284, + "learning_rate": 5.606930953419517e-05, + "loss": 1.6432, + "step": 1578 + }, + { + "epoch": 0.38130886259357644, + "grad_norm": 0.3089030683040619, + "learning_rate": 5.6040946561500594e-05, + "loss": 1.6171, + "step": 1579 + }, + { + "epoch": 0.3815503501569669, + "grad_norm": 0.31117933988571167, + "learning_rate": 
5.601257397477252e-05, + "loss": 1.7193, + "step": 1580 + }, + { + "epoch": 0.3817918377203574, + "grad_norm": 0.31880703568458557, + "learning_rate": 5.59841917910159e-05, + "loss": 1.7253, + "step": 1581 + }, + { + "epoch": 0.3820333252837479, + "grad_norm": 0.3133091330528259, + "learning_rate": 5.595580002724137e-05, + "loss": 1.522, + "step": 1582 + }, + { + "epoch": 0.3822748128471384, + "grad_norm": 0.34288087487220764, + "learning_rate": 5.592739870046537e-05, + "loss": 1.8463, + "step": 1583 + }, + { + "epoch": 0.38251630041052886, + "grad_norm": 0.3295765817165375, + "learning_rate": 5.589898782771004e-05, + "loss": 1.8319, + "step": 1584 + }, + { + "epoch": 0.38275778797391935, + "grad_norm": 0.3100754916667938, + "learning_rate": 5.587056742600322e-05, + "loss": 1.6536, + "step": 1585 + }, + { + "epoch": 0.38299927553730984, + "grad_norm": 0.3132288157939911, + "learning_rate": 5.5842137512378524e-05, + "loss": 1.6085, + "step": 1586 + }, + { + "epoch": 0.3832407631007003, + "grad_norm": 0.3163909912109375, + "learning_rate": 5.5813698103875206e-05, + "loss": 1.5761, + "step": 1587 + }, + { + "epoch": 0.3834822506640908, + "grad_norm": 0.3432241976261139, + "learning_rate": 5.578524921753824e-05, + "loss": 1.6101, + "step": 1588 + }, + { + "epoch": 0.3837237382274813, + "grad_norm": 0.307777464389801, + "learning_rate": 5.5756790870418274e-05, + "loss": 1.7152, + "step": 1589 + }, + { + "epoch": 0.3839652257908718, + "grad_norm": 0.31681734323501587, + "learning_rate": 5.572832307957163e-05, + "loss": 1.7113, + "step": 1590 + }, + { + "epoch": 0.38420671335426226, + "grad_norm": 0.33259811997413635, + "learning_rate": 5.569984586206028e-05, + "loss": 1.6767, + "step": 1591 + }, + { + "epoch": 0.38444820091765275, + "grad_norm": 0.32139548659324646, + "learning_rate": 5.567135923495187e-05, + "loss": 1.8471, + "step": 1592 + }, + { + "epoch": 0.38468968848104323, + "grad_norm": 0.3762575089931488, + "learning_rate": 5.564286321531965e-05, + "loss": 1.8994, + "step": 1593 + }, + { + "epoch": 0.3849311760444337, + "grad_norm": 0.32005301117897034, + "learning_rate": 5.5614357820242525e-05, + "loss": 1.6572, + "step": 1594 + }, + { + "epoch": 0.3851726636078242, + "grad_norm": 0.3230658769607544, + "learning_rate": 5.558584306680501e-05, + "loss": 1.7142, + "step": 1595 + }, + { + "epoch": 0.3854141511712147, + "grad_norm": 0.31494832038879395, + "learning_rate": 5.5557318972097226e-05, + "loss": 1.7121, + "step": 1596 + }, + { + "epoch": 0.3856556387346052, + "grad_norm": 0.31691285967826843, + "learning_rate": 5.552878555321491e-05, + "loss": 1.707, + "step": 1597 + }, + { + "epoch": 0.38589712629799566, + "grad_norm": 0.3042242228984833, + "learning_rate": 5.550024282725936e-05, + "loss": 1.6972, + "step": 1598 + }, + { + "epoch": 0.38613861386138615, + "grad_norm": 0.30469492077827454, + "learning_rate": 5.5471690811337494e-05, + "loss": 1.6826, + "step": 1599 + }, + { + "epoch": 0.38638010142477663, + "grad_norm": 0.31376826763153076, + "learning_rate": 5.5443129522561734e-05, + "loss": 1.6751, + "step": 1600 + }, + { + "epoch": 0.3866215889881671, + "grad_norm": 0.31754270195961, + "learning_rate": 5.541455897805012e-05, + "loss": 1.737, + "step": 1601 + }, + { + "epoch": 0.3868630765515576, + "grad_norm": 0.3262483775615692, + "learning_rate": 5.538597919492621e-05, + "loss": 1.5888, + "step": 1602 + }, + { + "epoch": 0.3871045641149481, + "grad_norm": 0.32402339577674866, + "learning_rate": 5.53573901903191e-05, + "loss": 1.7864, + "step": 1603 + }, + { + "epoch": 
0.3873460516783386, + "grad_norm": 0.321544349193573, + "learning_rate": 5.5328791981363435e-05, + "loss": 1.7058, + "step": 1604 + }, + { + "epoch": 0.38758753924172906, + "grad_norm": 0.31502535939216614, + "learning_rate": 5.530018458519935e-05, + "loss": 1.7887, + "step": 1605 + }, + { + "epoch": 0.38782902680511955, + "grad_norm": 0.30999353528022766, + "learning_rate": 5.5271568018972474e-05, + "loss": 1.7674, + "step": 1606 + }, + { + "epoch": 0.38807051436851003, + "grad_norm": 0.31182703375816345, + "learning_rate": 5.5242942299833984e-05, + "loss": 1.7194, + "step": 1607 + }, + { + "epoch": 0.3883120019319005, + "grad_norm": 0.31964096426963806, + "learning_rate": 5.5214307444940495e-05, + "loss": 1.6184, + "step": 1608 + }, + { + "epoch": 0.388553489495291, + "grad_norm": 0.3312462866306305, + "learning_rate": 5.5185663471454115e-05, + "loss": 1.7521, + "step": 1609 + }, + { + "epoch": 0.3887949770586815, + "grad_norm": 0.3217445909976959, + "learning_rate": 5.515701039654243e-05, + "loss": 1.7388, + "step": 1610 + }, + { + "epoch": 0.389036464622072, + "grad_norm": 0.3201799690723419, + "learning_rate": 5.512834823737846e-05, + "loss": 1.7771, + "step": 1611 + }, + { + "epoch": 0.38927795218546246, + "grad_norm": 0.3134850561618805, + "learning_rate": 5.509967701114068e-05, + "loss": 1.7415, + "step": 1612 + }, + { + "epoch": 0.38951943974885295, + "grad_norm": 0.3229968845844269, + "learning_rate": 5.5070996735013e-05, + "loss": 1.8011, + "step": 1613 + }, + { + "epoch": 0.38976092731224343, + "grad_norm": 0.3218373656272888, + "learning_rate": 5.5042307426184735e-05, + "loss": 1.7577, + "step": 1614 + }, + { + "epoch": 0.3900024148756339, + "grad_norm": 0.3155001997947693, + "learning_rate": 5.501360910185063e-05, + "loss": 1.7679, + "step": 1615 + }, + { + "epoch": 0.3902439024390244, + "grad_norm": 0.3090244233608246, + "learning_rate": 5.4984901779210855e-05, + "loss": 1.6268, + "step": 1616 + }, + { + "epoch": 0.3904853900024149, + "grad_norm": 0.3192291557788849, + "learning_rate": 5.495618547547094e-05, + "loss": 1.8372, + "step": 1617 + }, + { + "epoch": 0.3907268775658054, + "grad_norm": 0.31249117851257324, + "learning_rate": 5.4927460207841796e-05, + "loss": 1.8075, + "step": 1618 + }, + { + "epoch": 0.39096836512919586, + "grad_norm": 0.3381814658641815, + "learning_rate": 5.4898725993539735e-05, + "loss": 1.9058, + "step": 1619 + }, + { + "epoch": 0.39120985269258635, + "grad_norm": 0.31739556789398193, + "learning_rate": 5.48699828497864e-05, + "loss": 1.8154, + "step": 1620 + }, + { + "epoch": 0.39145134025597683, + "grad_norm": 0.3291226029396057, + "learning_rate": 5.484123079380882e-05, + "loss": 1.7774, + "step": 1621 + }, + { + "epoch": 0.3916928278193673, + "grad_norm": 0.30211007595062256, + "learning_rate": 5.4812469842839334e-05, + "loss": 1.6932, + "step": 1622 + }, + { + "epoch": 0.3919343153827578, + "grad_norm": 0.3263416886329651, + "learning_rate": 5.478370001411564e-05, + "loss": 1.7078, + "step": 1623 + }, + { + "epoch": 0.3921758029461483, + "grad_norm": 0.3306402266025543, + "learning_rate": 5.475492132488072e-05, + "loss": 1.8144, + "step": 1624 + }, + { + "epoch": 0.3924172905095388, + "grad_norm": 0.31025224924087524, + "learning_rate": 5.472613379238289e-05, + "loss": 1.7594, + "step": 1625 + }, + { + "epoch": 0.39265877807292926, + "grad_norm": 0.31240203976631165, + "learning_rate": 5.4697337433875785e-05, + "loss": 1.7538, + "step": 1626 + }, + { + "epoch": 0.39290026563631975, + "grad_norm": 0.32786843180656433, + "learning_rate": 
5.466853226661828e-05, + "loss": 1.7343, + "step": 1627 + }, + { + "epoch": 0.39314175319971023, + "grad_norm": 0.31915387511253357, + "learning_rate": 5.4639718307874576e-05, + "loss": 1.6627, + "step": 1628 + }, + { + "epoch": 0.3933832407631007, + "grad_norm": 0.3256676495075226, + "learning_rate": 5.461089557491413e-05, + "loss": 1.6906, + "step": 1629 + }, + { + "epoch": 0.3936247283264912, + "grad_norm": 0.33956941962242126, + "learning_rate": 5.4582064085011644e-05, + "loss": 1.7723, + "step": 1630 + }, + { + "epoch": 0.3938662158898817, + "grad_norm": 0.32009264826774597, + "learning_rate": 5.455322385544707e-05, + "loss": 1.6601, + "step": 1631 + }, + { + "epoch": 0.3941077034532722, + "grad_norm": 0.3323977291584015, + "learning_rate": 5.452437490350562e-05, + "loss": 1.8277, + "step": 1632 + }, + { + "epoch": 0.39434919101666266, + "grad_norm": 0.3169059753417969, + "learning_rate": 5.449551724647772e-05, + "loss": 1.7505, + "step": 1633 + }, + { + "epoch": 0.39459067858005314, + "grad_norm": 0.3227306306362152, + "learning_rate": 5.446665090165901e-05, + "loss": 1.9677, + "step": 1634 + }, + { + "epoch": 0.39483216614344363, + "grad_norm": 0.32162293791770935, + "learning_rate": 5.4437775886350334e-05, + "loss": 1.7486, + "step": 1635 + }, + { + "epoch": 0.3950736537068341, + "grad_norm": 0.3121008574962616, + "learning_rate": 5.440889221785773e-05, + "loss": 1.6298, + "step": 1636 + }, + { + "epoch": 0.3953151412702246, + "grad_norm": 0.3139210343360901, + "learning_rate": 5.437999991349246e-05, + "loss": 1.7676, + "step": 1637 + }, + { + "epoch": 0.3955566288336151, + "grad_norm": 0.30618348717689514, + "learning_rate": 5.43510989905709e-05, + "loss": 1.7309, + "step": 1638 + }, + { + "epoch": 0.3957981163970056, + "grad_norm": 0.325777143239975, + "learning_rate": 5.432218946641465e-05, + "loss": 1.6668, + "step": 1639 + }, + { + "epoch": 0.39603960396039606, + "grad_norm": 0.3241610527038574, + "learning_rate": 5.429327135835042e-05, + "loss": 1.6995, + "step": 1640 + }, + { + "epoch": 0.39628109152378654, + "grad_norm": 0.3215353786945343, + "learning_rate": 5.4264344683710096e-05, + "loss": 1.8294, + "step": 1641 + }, + { + "epoch": 0.39652257908717703, + "grad_norm": 0.3343597650527954, + "learning_rate": 5.4235409459830664e-05, + "loss": 1.7734, + "step": 1642 + }, + { + "epoch": 0.3967640666505675, + "grad_norm": 0.3067845404148102, + "learning_rate": 5.4206465704054295e-05, + "loss": 1.5428, + "step": 1643 + }, + { + "epoch": 0.397005554213958, + "grad_norm": 0.31020960211753845, + "learning_rate": 5.41775134337282e-05, + "loss": 1.7374, + "step": 1644 + }, + { + "epoch": 0.3972470417773485, + "grad_norm": 0.3085239827632904, + "learning_rate": 5.414855266620475e-05, + "loss": 1.5923, + "step": 1645 + }, + { + "epoch": 0.39748852934073897, + "grad_norm": 0.30102473497390747, + "learning_rate": 5.411958341884137e-05, + "loss": 1.6841, + "step": 1646 + }, + { + "epoch": 0.39773001690412946, + "grad_norm": 0.32308852672576904, + "learning_rate": 5.4090605709000574e-05, + "loss": 1.8351, + "step": 1647 + }, + { + "epoch": 0.39797150446751994, + "grad_norm": 0.34821414947509766, + "learning_rate": 5.406161955405e-05, + "loss": 1.809, + "step": 1648 + }, + { + "epoch": 0.39821299203091043, + "grad_norm": 0.36567896604537964, + "learning_rate": 5.403262497136227e-05, + "loss": 1.7037, + "step": 1649 + }, + { + "epoch": 0.3984544795943009, + "grad_norm": 0.3330789804458618, + "learning_rate": 5.4003621978315095e-05, + "loss": 1.7455, + "step": 1650 + }, + { + "epoch": 
0.3986959671576914, + "grad_norm": 0.3309069871902466, + "learning_rate": 5.3974610592291235e-05, + "loss": 1.9542, + "step": 1651 + }, + { + "epoch": 0.3989374547210819, + "grad_norm": 0.3199659287929535, + "learning_rate": 5.394559083067845e-05, + "loss": 1.683, + "step": 1652 + }, + { + "epoch": 0.39917894228447237, + "grad_norm": 0.3193099796772003, + "learning_rate": 5.3916562710869556e-05, + "loss": 1.6782, + "step": 1653 + }, + { + "epoch": 0.39942042984786286, + "grad_norm": 0.33188971877098083, + "learning_rate": 5.388752625026237e-05, + "loss": 1.6784, + "step": 1654 + }, + { + "epoch": 0.39966191741125334, + "grad_norm": 0.3204587399959564, + "learning_rate": 5.385848146625969e-05, + "loss": 1.7851, + "step": 1655 + }, + { + "epoch": 0.39990340497464383, + "grad_norm": 0.33102720975875854, + "learning_rate": 5.38294283762693e-05, + "loss": 1.783, + "step": 1656 + }, + { + "epoch": 0.4001448925380343, + "grad_norm": 0.33277207612991333, + "learning_rate": 5.380036699770399e-05, + "loss": 1.858, + "step": 1657 + }, + { + "epoch": 0.4003863801014248, + "grad_norm": 0.3018147945404053, + "learning_rate": 5.377129734798149e-05, + "loss": 1.6409, + "step": 1658 + }, + { + "epoch": 0.4006278676648153, + "grad_norm": 0.339412122964859, + "learning_rate": 5.3742219444524504e-05, + "loss": 1.7925, + "step": 1659 + }, + { + "epoch": 0.40086935522820577, + "grad_norm": 0.32353413105010986, + "learning_rate": 5.371313330476068e-05, + "loss": 1.6374, + "step": 1660 + }, + { + "epoch": 0.40111084279159626, + "grad_norm": 0.31544435024261475, + "learning_rate": 5.368403894612261e-05, + "loss": 1.7994, + "step": 1661 + }, + { + "epoch": 0.40135233035498674, + "grad_norm": 0.3048715889453888, + "learning_rate": 5.365493638604777e-05, + "loss": 1.7828, + "step": 1662 + }, + { + "epoch": 0.4015938179183772, + "grad_norm": 0.33015862107276917, + "learning_rate": 5.362582564197863e-05, + "loss": 1.7849, + "step": 1663 + }, + { + "epoch": 0.4018353054817677, + "grad_norm": 0.3231745660305023, + "learning_rate": 5.359670673136247e-05, + "loss": 1.5934, + "step": 1664 + }, + { + "epoch": 0.4020767930451582, + "grad_norm": 0.35362470149993896, + "learning_rate": 5.3567579671651544e-05, + "loss": 1.7968, + "step": 1665 + }, + { + "epoch": 0.4023182806085487, + "grad_norm": 0.3389405608177185, + "learning_rate": 5.353844448030297e-05, + "loss": 1.7623, + "step": 1666 + }, + { + "epoch": 0.40255976817193917, + "grad_norm": 0.32034578919410706, + "learning_rate": 5.35093011747787e-05, + "loss": 1.732, + "step": 1667 + }, + { + "epoch": 0.40280125573532966, + "grad_norm": 0.33826392889022827, + "learning_rate": 5.348014977254558e-05, + "loss": 1.8616, + "step": 1668 + }, + { + "epoch": 0.40304274329872014, + "grad_norm": 0.33229494094848633, + "learning_rate": 5.345099029107533e-05, + "loss": 1.8809, + "step": 1669 + }, + { + "epoch": 0.4032842308621106, + "grad_norm": 0.3166428506374359, + "learning_rate": 5.342182274784447e-05, + "loss": 1.7468, + "step": 1670 + }, + { + "epoch": 0.4035257184255011, + "grad_norm": 0.3228038251399994, + "learning_rate": 5.339264716033438e-05, + "loss": 1.6577, + "step": 1671 + }, + { + "epoch": 0.4037672059888916, + "grad_norm": 0.30518126487731934, + "learning_rate": 5.336346354603125e-05, + "loss": 1.7055, + "step": 1672 + }, + { + "epoch": 0.4040086935522821, + "grad_norm": 0.32699069380760193, + "learning_rate": 5.3334271922426085e-05, + "loss": 1.6633, + "step": 1673 + }, + { + "epoch": 0.40425018111567257, + "grad_norm": 0.32846981287002563, + "learning_rate": 
5.3305072307014684e-05, + "loss": 1.7801, + "step": 1674 + }, + { + "epoch": 0.40449166867906305, + "grad_norm": 0.3315163254737854, + "learning_rate": 5.3275864717297624e-05, + "loss": 1.8734, + "step": 1675 + }, + { + "epoch": 0.40473315624245354, + "grad_norm": 0.32653379440307617, + "learning_rate": 5.324664917078032e-05, + "loss": 1.8171, + "step": 1676 + }, + { + "epoch": 0.404974643805844, + "grad_norm": 0.310324102640152, + "learning_rate": 5.3217425684972876e-05, + "loss": 1.6035, + "step": 1677 + }, + { + "epoch": 0.4052161313692345, + "grad_norm": 0.30552801489830017, + "learning_rate": 5.318819427739021e-05, + "loss": 1.5884, + "step": 1678 + }, + { + "epoch": 0.405457618932625, + "grad_norm": 0.31171873211860657, + "learning_rate": 5.315895496555197e-05, + "loss": 1.7287, + "step": 1679 + }, + { + "epoch": 0.4056991064960155, + "grad_norm": 0.4216386377811432, + "learning_rate": 5.312970776698252e-05, + "loss": 1.8202, + "step": 1680 + }, + { + "epoch": 0.40594059405940597, + "grad_norm": 0.30686837434768677, + "learning_rate": 5.3100452699211e-05, + "loss": 1.6182, + "step": 1681 + }, + { + "epoch": 0.40618208162279645, + "grad_norm": 0.31551510095596313, + "learning_rate": 5.307118977977122e-05, + "loss": 1.7769, + "step": 1682 + }, + { + "epoch": 0.4064235691861869, + "grad_norm": 0.32668325304985046, + "learning_rate": 5.3041919026201714e-05, + "loss": 1.8593, + "step": 1683 + }, + { + "epoch": 0.40666505674957737, + "grad_norm": 0.3222865164279938, + "learning_rate": 5.301264045604573e-05, + "loss": 1.7289, + "step": 1684 + }, + { + "epoch": 0.40690654431296785, + "grad_norm": 0.319663941860199, + "learning_rate": 5.2983354086851146e-05, + "loss": 1.7866, + "step": 1685 + }, + { + "epoch": 0.40714803187635834, + "grad_norm": 0.3232978582382202, + "learning_rate": 5.295405993617059e-05, + "loss": 1.761, + "step": 1686 + }, + { + "epoch": 0.4073895194397488, + "grad_norm": 0.31206750869750977, + "learning_rate": 5.29247580215613e-05, + "loss": 1.5944, + "step": 1687 + }, + { + "epoch": 0.4076310070031393, + "grad_norm": 0.3296249508857727, + "learning_rate": 5.289544836058517e-05, + "loss": 1.7709, + "step": 1688 + }, + { + "epoch": 0.4078724945665298, + "grad_norm": 0.30123740434646606, + "learning_rate": 5.286613097080876e-05, + "loss": 1.6726, + "step": 1689 + }, + { + "epoch": 0.4081139821299203, + "grad_norm": 0.3386242091655731, + "learning_rate": 5.2836805869803255e-05, + "loss": 1.9382, + "step": 1690 + }, + { + "epoch": 0.40835546969331077, + "grad_norm": 0.3159593641757965, + "learning_rate": 5.2807473075144445e-05, + "loss": 1.6599, + "step": 1691 + }, + { + "epoch": 0.40859695725670125, + "grad_norm": 0.33741095662117004, + "learning_rate": 5.277813260441274e-05, + "loss": 1.9443, + "step": 1692 + }, + { + "epoch": 0.40883844482009174, + "grad_norm": 0.3497970998287201, + "learning_rate": 5.274878447519318e-05, + "loss": 1.9927, + "step": 1693 + }, + { + "epoch": 0.4090799323834822, + "grad_norm": 0.3178463578224182, + "learning_rate": 5.271942870507534e-05, + "loss": 1.5977, + "step": 1694 + }, + { + "epoch": 0.4093214199468727, + "grad_norm": 0.31562668085098267, + "learning_rate": 5.2690065311653416e-05, + "loss": 1.6623, + "step": 1695 + }, + { + "epoch": 0.4095629075102632, + "grad_norm": 0.320965975522995, + "learning_rate": 5.2660694312526154e-05, + "loss": 1.7709, + "step": 1696 + }, + { + "epoch": 0.4098043950736537, + "grad_norm": 0.31700804829597473, + "learning_rate": 5.263131572529688e-05, + "loss": 1.8144, + "step": 1697 + }, + { + "epoch": 
0.41004588263704417, + "grad_norm": 0.3084293007850647, + "learning_rate": 5.260192956757343e-05, + "loss": 1.713, + "step": 1698 + }, + { + "epoch": 0.41028737020043465, + "grad_norm": 0.31365668773651123, + "learning_rate": 5.2572535856968225e-05, + "loss": 1.7754, + "step": 1699 + }, + { + "epoch": 0.41052885776382514, + "grad_norm": 0.3074451684951782, + "learning_rate": 5.254313461109816e-05, + "loss": 1.7289, + "step": 1700 + }, + { + "epoch": 0.4107703453272156, + "grad_norm": 0.31787192821502686, + "learning_rate": 5.251372584758471e-05, + "loss": 1.7623, + "step": 1701 + }, + { + "epoch": 0.4110118328906061, + "grad_norm": 0.3212306797504425, + "learning_rate": 5.2484309584053794e-05, + "loss": 1.7933, + "step": 1702 + }, + { + "epoch": 0.4112533204539966, + "grad_norm": 0.32138124108314514, + "learning_rate": 5.2454885838135846e-05, + "loss": 1.7146, + "step": 1703 + }, + { + "epoch": 0.4114948080173871, + "grad_norm": 0.3186517357826233, + "learning_rate": 5.242545462746581e-05, + "loss": 1.7416, + "step": 1704 + }, + { + "epoch": 0.41173629558077757, + "grad_norm": 0.29256436228752136, + "learning_rate": 5.2396015969683086e-05, + "loss": 1.5187, + "step": 1705 + }, + { + "epoch": 0.41197778314416805, + "grad_norm": 0.30244478583335876, + "learning_rate": 5.23665698824315e-05, + "loss": 1.7165, + "step": 1706 + }, + { + "epoch": 0.41221927070755854, + "grad_norm": 0.31398919224739075, + "learning_rate": 5.2337116383359415e-05, + "loss": 1.6597, + "step": 1707 + }, + { + "epoch": 0.412460758270949, + "grad_norm": 0.31446996331214905, + "learning_rate": 5.2307655490119546e-05, + "loss": 1.6449, + "step": 1708 + }, + { + "epoch": 0.4127022458343395, + "grad_norm": 0.2963344156742096, + "learning_rate": 5.227818722036911e-05, + "loss": 1.5533, + "step": 1709 + }, + { + "epoch": 0.41294373339773, + "grad_norm": 0.3177819550037384, + "learning_rate": 5.22487115917697e-05, + "loss": 1.7537, + "step": 1710 + }, + { + "epoch": 0.4131852209611205, + "grad_norm": 0.300102174282074, + "learning_rate": 5.221922862198735e-05, + "loss": 1.6307, + "step": 1711 + }, + { + "epoch": 0.41342670852451097, + "grad_norm": 0.30934983491897583, + "learning_rate": 5.218973832869247e-05, + "loss": 1.676, + "step": 1712 + }, + { + "epoch": 0.41366819608790145, + "grad_norm": 0.32400190830230713, + "learning_rate": 5.216024072955988e-05, + "loss": 1.7713, + "step": 1713 + }, + { + "epoch": 0.41390968365129194, + "grad_norm": 0.3195970356464386, + "learning_rate": 5.213073584226874e-05, + "loss": 1.7983, + "step": 1714 + }, + { + "epoch": 0.4141511712146824, + "grad_norm": 0.33358579874038696, + "learning_rate": 5.210122368450263e-05, + "loss": 1.7347, + "step": 1715 + }, + { + "epoch": 0.4143926587780729, + "grad_norm": 0.32983365654945374, + "learning_rate": 5.207170427394946e-05, + "loss": 1.8242, + "step": 1716 + }, + { + "epoch": 0.4146341463414634, + "grad_norm": 0.3056759238243103, + "learning_rate": 5.204217762830149e-05, + "loss": 1.7023, + "step": 1717 + }, + { + "epoch": 0.4148756339048539, + "grad_norm": 0.3775116205215454, + "learning_rate": 5.201264376525531e-05, + "loss": 1.5639, + "step": 1718 + }, + { + "epoch": 0.41511712146824437, + "grad_norm": 0.31459784507751465, + "learning_rate": 5.1983102702511846e-05, + "loss": 1.7042, + "step": 1719 + }, + { + "epoch": 0.41535860903163485, + "grad_norm": 0.331863135099411, + "learning_rate": 5.195355445777634e-05, + "loss": 1.8641, + "step": 1720 + }, + { + "epoch": 0.41560009659502534, + "grad_norm": 0.3138774335384369, + "learning_rate": 
5.1923999048758324e-05, + "loss": 1.7186, + "step": 1721 + }, + { + "epoch": 0.4158415841584158, + "grad_norm": 0.30020132660865784, + "learning_rate": 5.1894436493171646e-05, + "loss": 1.6064, + "step": 1722 + }, + { + "epoch": 0.4160830717218063, + "grad_norm": 0.28870853781700134, + "learning_rate": 5.186486680873442e-05, + "loss": 1.3951, + "step": 1723 + }, + { + "epoch": 0.4163245592851968, + "grad_norm": 0.3199133276939392, + "learning_rate": 5.1835290013169025e-05, + "loss": 1.8676, + "step": 1724 + }, + { + "epoch": 0.4165660468485873, + "grad_norm": 0.3255309462547302, + "learning_rate": 5.180570612420214e-05, + "loss": 1.8917, + "step": 1725 + }, + { + "epoch": 0.41680753441197776, + "grad_norm": 0.3171748220920563, + "learning_rate": 5.1776115159564664e-05, + "loss": 1.7169, + "step": 1726 + }, + { + "epoch": 0.41704902197536825, + "grad_norm": 0.3134252727031708, + "learning_rate": 5.1746517136991706e-05, + "loss": 1.8052, + "step": 1727 + }, + { + "epoch": 0.41729050953875874, + "grad_norm": 0.34795841574668884, + "learning_rate": 5.171691207422269e-05, + "loss": 1.8521, + "step": 1728 + }, + { + "epoch": 0.4175319971021492, + "grad_norm": 0.32155516743659973, + "learning_rate": 5.168729998900118e-05, + "loss": 1.7458, + "step": 1729 + }, + { + "epoch": 0.4177734846655397, + "grad_norm": 0.3200514614582062, + "learning_rate": 5.165768089907501e-05, + "loss": 1.7987, + "step": 1730 + }, + { + "epoch": 0.4180149722289302, + "grad_norm": 0.3235030174255371, + "learning_rate": 5.162805482219615e-05, + "loss": 1.6807, + "step": 1731 + }, + { + "epoch": 0.4182564597923207, + "grad_norm": 0.3196185529232025, + "learning_rate": 5.159842177612081e-05, + "loss": 1.7257, + "step": 1732 + }, + { + "epoch": 0.41849794735571116, + "grad_norm": 0.3183957636356354, + "learning_rate": 5.1568781778609336e-05, + "loss": 1.8169, + "step": 1733 + }, + { + "epoch": 0.41873943491910165, + "grad_norm": 0.31911373138427734, + "learning_rate": 5.153913484742629e-05, + "loss": 1.6534, + "step": 1734 + }, + { + "epoch": 0.41898092248249214, + "grad_norm": 0.31826508045196533, + "learning_rate": 5.1509481000340345e-05, + "loss": 1.7524, + "step": 1735 + }, + { + "epoch": 0.4192224100458826, + "grad_norm": 0.2995862364768982, + "learning_rate": 5.147982025512434e-05, + "loss": 1.6639, + "step": 1736 + }, + { + "epoch": 0.4194638976092731, + "grad_norm": 0.34899625182151794, + "learning_rate": 5.1450152629555245e-05, + "loss": 1.7866, + "step": 1737 + }, + { + "epoch": 0.4197053851726636, + "grad_norm": 0.3255622684955597, + "learning_rate": 5.142047814141414e-05, + "loss": 1.7003, + "step": 1738 + }, + { + "epoch": 0.4199468727360541, + "grad_norm": 0.328663170337677, + "learning_rate": 5.139079680848623e-05, + "loss": 1.7505, + "step": 1739 + }, + { + "epoch": 0.42018836029944456, + "grad_norm": 0.29785701632499695, + "learning_rate": 5.136110864856084e-05, + "loss": 1.5607, + "step": 1740 + }, + { + "epoch": 0.42042984786283505, + "grad_norm": 0.3232966363430023, + "learning_rate": 5.133141367943136e-05, + "loss": 1.8571, + "step": 1741 + }, + { + "epoch": 0.42067133542622553, + "grad_norm": 0.29955655336380005, + "learning_rate": 5.130171191889526e-05, + "loss": 1.6468, + "step": 1742 + }, + { + "epoch": 0.420912822989616, + "grad_norm": 0.32290521264076233, + "learning_rate": 5.127200338475411e-05, + "loss": 1.9304, + "step": 1743 + }, + { + "epoch": 0.4211543105530065, + "grad_norm": 0.33338356018066406, + "learning_rate": 5.124228809481351e-05, + "loss": 1.9154, + "step": 1744 + }, + { + "epoch": 
0.421395798116397, + "grad_norm": 0.32244834303855896, + "learning_rate": 5.1212566066883116e-05, + "loss": 1.7334, + "step": 1745 + }, + { + "epoch": 0.4216372856797875, + "grad_norm": 0.3112456202507019, + "learning_rate": 5.118283731877663e-05, + "loss": 1.7167, + "step": 1746 + }, + { + "epoch": 0.42187877324317796, + "grad_norm": 0.3183744251728058, + "learning_rate": 5.1153101868311776e-05, + "loss": 1.7666, + "step": 1747 + }, + { + "epoch": 0.42212026080656845, + "grad_norm": 0.3148494362831116, + "learning_rate": 5.1123359733310284e-05, + "loss": 1.7667, + "step": 1748 + }, + { + "epoch": 0.42236174836995893, + "grad_norm": 0.33314794301986694, + "learning_rate": 5.109361093159793e-05, + "loss": 1.8291, + "step": 1749 + }, + { + "epoch": 0.4226032359333494, + "grad_norm": 0.3257341682910919, + "learning_rate": 5.106385548100444e-05, + "loss": 1.8156, + "step": 1750 + }, + { + "epoch": 0.4228447234967399, + "grad_norm": 0.314256876707077, + "learning_rate": 5.103409339936354e-05, + "loss": 1.8064, + "step": 1751 + }, + { + "epoch": 0.4230862110601304, + "grad_norm": 0.31599828600883484, + "learning_rate": 5.100432470451294e-05, + "loss": 1.6887, + "step": 1752 + }, + { + "epoch": 0.4233276986235209, + "grad_norm": 0.30414825677871704, + "learning_rate": 5.0974549414294316e-05, + "loss": 1.6797, + "step": 1753 + }, + { + "epoch": 0.42356918618691136, + "grad_norm": 0.3340110182762146, + "learning_rate": 5.0944767546553264e-05, + "loss": 1.9084, + "step": 1754 + }, + { + "epoch": 0.42381067375030185, + "grad_norm": 0.3074990510940552, + "learning_rate": 5.091497911913938e-05, + "loss": 1.6124, + "step": 1755 + }, + { + "epoch": 0.42405216131369233, + "grad_norm": 0.31545865535736084, + "learning_rate": 5.088518414990614e-05, + "loss": 1.6553, + "step": 1756 + }, + { + "epoch": 0.4242936488770828, + "grad_norm": 0.3098644018173218, + "learning_rate": 5.0855382656710944e-05, + "loss": 1.6836, + "step": 1757 + }, + { + "epoch": 0.4245351364404733, + "grad_norm": 0.32377690076828003, + "learning_rate": 5.082557465741513e-05, + "loss": 1.8453, + "step": 1758 + }, + { + "epoch": 0.4247766240038638, + "grad_norm": 0.3402831554412842, + "learning_rate": 5.0795760169883926e-05, + "loss": 1.7824, + "step": 1759 + }, + { + "epoch": 0.4250181115672543, + "grad_norm": 0.30646243691444397, + "learning_rate": 5.076593921198644e-05, + "loss": 1.6201, + "step": 1760 + }, + { + "epoch": 0.42525959913064476, + "grad_norm": 0.3204982876777649, + "learning_rate": 5.0736111801595674e-05, + "loss": 1.8092, + "step": 1761 + }, + { + "epoch": 0.42550108669403525, + "grad_norm": 0.34092098474502563, + "learning_rate": 5.0706277956588456e-05, + "loss": 1.8603, + "step": 1762 + }, + { + "epoch": 0.42574257425742573, + "grad_norm": 0.3115682899951935, + "learning_rate": 5.0676437694845544e-05, + "loss": 1.7216, + "step": 1763 + }, + { + "epoch": 0.4259840618208162, + "grad_norm": 0.2900623083114624, + "learning_rate": 5.064659103425145e-05, + "loss": 1.5347, + "step": 1764 + }, + { + "epoch": 0.4262255493842067, + "grad_norm": 0.34669458866119385, + "learning_rate": 5.0616737992694595e-05, + "loss": 2.0433, + "step": 1765 + }, + { + "epoch": 0.4264670369475972, + "grad_norm": 0.32388561964035034, + "learning_rate": 5.0586878588067215e-05, + "loss": 1.8193, + "step": 1766 + }, + { + "epoch": 0.4267085245109877, + "grad_norm": 0.32599443197250366, + "learning_rate": 5.0557012838265326e-05, + "loss": 1.6705, + "step": 1767 + }, + { + "epoch": 0.42695001207437816, + "grad_norm": 0.31643036007881165, + 
"learning_rate": 5.052714076118875e-05, + "loss": 1.6169, + "step": 1768 + }, + { + "epoch": 0.42719149963776865, + "grad_norm": 0.301062673330307, + "learning_rate": 5.0497262374741136e-05, + "loss": 1.706, + "step": 1769 + }, + { + "epoch": 0.42743298720115913, + "grad_norm": 0.31782886385917664, + "learning_rate": 5.046737769682989e-05, + "loss": 1.9235, + "step": 1770 + }, + { + "epoch": 0.4276744747645496, + "grad_norm": 0.3196124732494354, + "learning_rate": 5.043748674536618e-05, + "loss": 1.7779, + "step": 1771 + }, + { + "epoch": 0.4279159623279401, + "grad_norm": 0.31023970246315, + "learning_rate": 5.0407589538264974e-05, + "loss": 1.6582, + "step": 1772 + }, + { + "epoch": 0.4281574498913306, + "grad_norm": 0.31953737139701843, + "learning_rate": 5.0377686093444945e-05, + "loss": 1.6437, + "step": 1773 + }, + { + "epoch": 0.4283989374547211, + "grad_norm": 0.3527478873729706, + "learning_rate": 5.03477764288285e-05, + "loss": 1.9902, + "step": 1774 + }, + { + "epoch": 0.42864042501811156, + "grad_norm": 0.3176495134830475, + "learning_rate": 5.0317860562341825e-05, + "loss": 1.7831, + "step": 1775 + }, + { + "epoch": 0.42888191258150204, + "grad_norm": 0.3193947374820709, + "learning_rate": 5.02879385119148e-05, + "loss": 1.752, + "step": 1776 + }, + { + "epoch": 0.42912340014489253, + "grad_norm": 0.322971910238266, + "learning_rate": 5.025801029548097e-05, + "loss": 1.6216, + "step": 1777 + }, + { + "epoch": 0.429364887708283, + "grad_norm": 0.3086382746696472, + "learning_rate": 5.022807593097765e-05, + "loss": 1.6701, + "step": 1778 + }, + { + "epoch": 0.4296063752716735, + "grad_norm": 0.3198978900909424, + "learning_rate": 5.0198135436345776e-05, + "loss": 1.7816, + "step": 1779 + }, + { + "epoch": 0.429847862835064, + "grad_norm": 0.3353576362133026, + "learning_rate": 5.0168188829529986e-05, + "loss": 1.762, + "step": 1780 + }, + { + "epoch": 0.4300893503984545, + "grad_norm": 0.3208022713661194, + "learning_rate": 5.0138236128478587e-05, + "loss": 1.8141, + "step": 1781 + }, + { + "epoch": 0.43033083796184496, + "grad_norm": 0.32314246892929077, + "learning_rate": 5.010827735114351e-05, + "loss": 1.7433, + "step": 1782 + }, + { + "epoch": 0.43057232552523544, + "grad_norm": 0.3072111904621124, + "learning_rate": 5.0078312515480356e-05, + "loss": 1.7538, + "step": 1783 + }, + { + "epoch": 0.43081381308862593, + "grad_norm": 0.316180020570755, + "learning_rate": 5.004834163944836e-05, + "loss": 1.7431, + "step": 1784 + }, + { + "epoch": 0.4310553006520164, + "grad_norm": 0.3349752724170685, + "learning_rate": 5.0018364741010345e-05, + "loss": 1.791, + "step": 1785 + }, + { + "epoch": 0.4312967882154069, + "grad_norm": 0.31984061002731323, + "learning_rate": 4.998838183813277e-05, + "loss": 1.838, + "step": 1786 + }, + { + "epoch": 0.4315382757787974, + "grad_norm": 0.31683188676834106, + "learning_rate": 4.995839294878569e-05, + "loss": 1.8307, + "step": 1787 + }, + { + "epoch": 0.43177976334218787, + "grad_norm": 0.32636767625808716, + "learning_rate": 4.992839809094276e-05, + "loss": 1.8039, + "step": 1788 + }, + { + "epoch": 0.43202125090557836, + "grad_norm": 0.3164781928062439, + "learning_rate": 4.9898397282581164e-05, + "loss": 1.8272, + "step": 1789 + }, + { + "epoch": 0.43226273846896884, + "grad_norm": 0.3037387430667877, + "learning_rate": 4.986839054168171e-05, + "loss": 1.6591, + "step": 1790 + }, + { + "epoch": 0.43250422603235933, + "grad_norm": 0.31159907579421997, + "learning_rate": 4.983837788622872e-05, + "loss": 1.708, + "step": 1791 + }, + { + 
"epoch": 0.4327457135957498, + "grad_norm": 0.3006117343902588, + "learning_rate": 4.980835933421008e-05, + "loss": 1.5216, + "step": 1792 + }, + { + "epoch": 0.4329872011591403, + "grad_norm": 0.320086270570755, + "learning_rate": 4.9778334903617225e-05, + "loss": 1.6478, + "step": 1793 + }, + { + "epoch": 0.4332286887225308, + "grad_norm": 0.3265068829059601, + "learning_rate": 4.9748304612445076e-05, + "loss": 1.833, + "step": 1794 + }, + { + "epoch": 0.43347017628592127, + "grad_norm": 0.32293495535850525, + "learning_rate": 4.971826847869209e-05, + "loss": 1.72, + "step": 1795 + }, + { + "epoch": 0.43371166384931176, + "grad_norm": 0.28712642192840576, + "learning_rate": 4.9688226520360225e-05, + "loss": 1.5015, + "step": 1796 + }, + { + "epoch": 0.43395315141270224, + "grad_norm": 0.34202128648757935, + "learning_rate": 4.965817875545493e-05, + "loss": 1.7086, + "step": 1797 + }, + { + "epoch": 0.43419463897609273, + "grad_norm": 0.3475635051727295, + "learning_rate": 4.962812520198512e-05, + "loss": 1.876, + "step": 1798 + }, + { + "epoch": 0.4344361265394832, + "grad_norm": 0.3257412314414978, + "learning_rate": 4.959806587796321e-05, + "loss": 1.7665, + "step": 1799 + }, + { + "epoch": 0.4346776141028737, + "grad_norm": 0.30491873621940613, + "learning_rate": 4.956800080140503e-05, + "loss": 1.7476, + "step": 1800 + }, + { + "epoch": 0.4349191016662642, + "grad_norm": 0.32391414046287537, + "learning_rate": 4.953792999032989e-05, + "loss": 1.8963, + "step": 1801 + }, + { + "epoch": 0.43516058922965467, + "grad_norm": 0.3363605737686157, + "learning_rate": 4.950785346276054e-05, + "loss": 1.7886, + "step": 1802 + }, + { + "epoch": 0.43540207679304516, + "grad_norm": 0.3271222412586212, + "learning_rate": 4.947777123672314e-05, + "loss": 1.8712, + "step": 1803 + }, + { + "epoch": 0.43564356435643564, + "grad_norm": 0.3130126893520355, + "learning_rate": 4.9447683330247254e-05, + "loss": 1.7719, + "step": 1804 + }, + { + "epoch": 0.4358850519198261, + "grad_norm": 0.33484476804733276, + "learning_rate": 4.941758976136588e-05, + "loss": 1.8265, + "step": 1805 + }, + { + "epoch": 0.4361265394832166, + "grad_norm": 0.3352862298488617, + "learning_rate": 4.93874905481154e-05, + "loss": 1.8212, + "step": 1806 + }, + { + "epoch": 0.4363680270466071, + "grad_norm": 0.3315581679344177, + "learning_rate": 4.935738570853557e-05, + "loss": 1.7995, + "step": 1807 + }, + { + "epoch": 0.4366095146099976, + "grad_norm": 0.3371587097644806, + "learning_rate": 4.93272752606695e-05, + "loss": 1.8149, + "step": 1808 + }, + { + "epoch": 0.43685100217338807, + "grad_norm": 0.32511308789253235, + "learning_rate": 4.9297159222563735e-05, + "loss": 1.8111, + "step": 1809 + }, + { + "epoch": 0.43709248973677856, + "grad_norm": 0.32551050186157227, + "learning_rate": 4.926703761226808e-05, + "loss": 1.5647, + "step": 1810 + }, + { + "epoch": 0.43733397730016904, + "grad_norm": 0.30943354964256287, + "learning_rate": 4.9236910447835735e-05, + "loss": 1.6284, + "step": 1811 + }, + { + "epoch": 0.4375754648635595, + "grad_norm": 0.3279415965080261, + "learning_rate": 4.920677774732321e-05, + "loss": 1.8771, + "step": 1812 + }, + { + "epoch": 0.43781695242695, + "grad_norm": 0.32760724425315857, + "learning_rate": 4.917663952879033e-05, + "loss": 1.5721, + "step": 1813 + }, + { + "epoch": 0.4380584399903405, + "grad_norm": 0.3225950598716736, + "learning_rate": 4.914649581030025e-05, + "loss": 1.7678, + "step": 1814 + }, + { + "epoch": 0.438299927553731, + "grad_norm": 0.31700098514556885, + "learning_rate": 
4.91163466099194e-05, + "loss": 1.6486, + "step": 1815 + }, + { + "epoch": 0.43854141511712147, + "grad_norm": 0.3183005154132843, + "learning_rate": 4.9086191945717476e-05, + "loss": 1.5372, + "step": 1816 + }, + { + "epoch": 0.43878290268051195, + "grad_norm": 0.3154526352882385, + "learning_rate": 4.905603183576751e-05, + "loss": 1.619, + "step": 1817 + }, + { + "epoch": 0.43902439024390244, + "grad_norm": 0.32441118359565735, + "learning_rate": 4.902586629814574e-05, + "loss": 1.7581, + "step": 1818 + }, + { + "epoch": 0.4392658778072929, + "grad_norm": 0.31786707043647766, + "learning_rate": 4.899569535093167e-05, + "loss": 1.6648, + "step": 1819 + }, + { + "epoch": 0.4395073653706834, + "grad_norm": 0.30324339866638184, + "learning_rate": 4.8965519012208085e-05, + "loss": 1.6787, + "step": 1820 + }, + { + "epoch": 0.4397488529340739, + "grad_norm": 0.3461436629295349, + "learning_rate": 4.893533730006095e-05, + "loss": 1.6268, + "step": 1821 + }, + { + "epoch": 0.4399903404974644, + "grad_norm": 0.3194788098335266, + "learning_rate": 4.890515023257946e-05, + "loss": 1.8323, + "step": 1822 + }, + { + "epoch": 0.44023182806085487, + "grad_norm": 0.3068380057811737, + "learning_rate": 4.887495782785605e-05, + "loss": 1.8317, + "step": 1823 + }, + { + "epoch": 0.44047331562424535, + "grad_norm": 0.3199669420719147, + "learning_rate": 4.8844760103986346e-05, + "loss": 1.8499, + "step": 1824 + }, + { + "epoch": 0.44071480318763584, + "grad_norm": 0.3064357042312622, + "learning_rate": 4.881455707906911e-05, + "loss": 1.6994, + "step": 1825 + }, + { + "epoch": 0.4409562907510263, + "grad_norm": 0.32749029994010925, + "learning_rate": 4.8784348771206366e-05, + "loss": 1.742, + "step": 1826 + }, + { + "epoch": 0.4411977783144168, + "grad_norm": 0.29773974418640137, + "learning_rate": 4.875413519850323e-05, + "loss": 1.5753, + "step": 1827 + }, + { + "epoch": 0.4414392658778073, + "grad_norm": 0.314562052488327, + "learning_rate": 4.872391637906802e-05, + "loss": 1.729, + "step": 1828 + }, + { + "epoch": 0.4416807534411978, + "grad_norm": 0.3068162202835083, + "learning_rate": 4.869369233101217e-05, + "loss": 1.6617, + "step": 1829 + }, + { + "epoch": 0.44192224100458827, + "grad_norm": 0.31325581669807434, + "learning_rate": 4.866346307245027e-05, + "loss": 1.8767, + "step": 1830 + }, + { + "epoch": 0.44216372856797875, + "grad_norm": 0.3100229501724243, + "learning_rate": 4.8633228621500014e-05, + "loss": 1.6149, + "step": 1831 + }, + { + "epoch": 0.44240521613136924, + "grad_norm": 0.3256266415119171, + "learning_rate": 4.8602988996282235e-05, + "loss": 1.7831, + "step": 1832 + }, + { + "epoch": 0.4426467036947597, + "grad_norm": 0.337890088558197, + "learning_rate": 4.857274421492082e-05, + "loss": 1.865, + "step": 1833 + }, + { + "epoch": 0.4428881912581502, + "grad_norm": 0.3197672963142395, + "learning_rate": 4.854249429554281e-05, + "loss": 1.8182, + "step": 1834 + }, + { + "epoch": 0.4431296788215407, + "grad_norm": 0.31269827485084534, + "learning_rate": 4.851223925627826e-05, + "loss": 1.6953, + "step": 1835 + }, + { + "epoch": 0.4433711663849312, + "grad_norm": 0.30737265944480896, + "learning_rate": 4.848197911526034e-05, + "loss": 1.6799, + "step": 1836 + }, + { + "epoch": 0.44361265394832167, + "grad_norm": 0.3163803815841675, + "learning_rate": 4.8451713890625265e-05, + "loss": 1.6822, + "step": 1837 + }, + { + "epoch": 0.44385414151171215, + "grad_norm": 0.3576357662677765, + "learning_rate": 4.842144360051228e-05, + "loss": 1.7801, + "step": 1838 + }, + { + "epoch": 
0.44409562907510264, + "grad_norm": 0.3141801059246063, + "learning_rate": 4.839116826306369e-05, + "loss": 1.842, + "step": 1839 + }, + { + "epoch": 0.4443371166384931, + "grad_norm": 0.3117457330226898, + "learning_rate": 4.836088789642482e-05, + "loss": 1.6693, + "step": 1840 + }, + { + "epoch": 0.4445786042018836, + "grad_norm": 0.3110361695289612, + "learning_rate": 4.833060251874399e-05, + "loss": 1.7368, + "step": 1841 + }, + { + "epoch": 0.4448200917652741, + "grad_norm": 0.3443051278591156, + "learning_rate": 4.830031214817253e-05, + "loss": 1.857, + "step": 1842 + }, + { + "epoch": 0.4450615793286646, + "grad_norm": 0.39485305547714233, + "learning_rate": 4.827001680286481e-05, + "loss": 1.8448, + "step": 1843 + }, + { + "epoch": 0.44530306689205507, + "grad_norm": 0.3087663948535919, + "learning_rate": 4.8239716500978106e-05, + "loss": 1.7263, + "step": 1844 + }, + { + "epoch": 0.44554455445544555, + "grad_norm": 0.32222047448158264, + "learning_rate": 4.8209411260672705e-05, + "loss": 1.8257, + "step": 1845 + }, + { + "epoch": 0.44578604201883604, + "grad_norm": 0.322906494140625, + "learning_rate": 4.8179101100111864e-05, + "loss": 1.6751, + "step": 1846 + }, + { + "epoch": 0.4460275295822265, + "grad_norm": 0.3205435872077942, + "learning_rate": 4.8148786037461764e-05, + "loss": 1.8693, + "step": 1847 + }, + { + "epoch": 0.446269017145617, + "grad_norm": 0.31261250376701355, + "learning_rate": 4.811846609089153e-05, + "loss": 1.6956, + "step": 1848 + }, + { + "epoch": 0.4465105047090075, + "grad_norm": 0.3247355818748474, + "learning_rate": 4.808814127857322e-05, + "loss": 1.7054, + "step": 1849 + }, + { + "epoch": 0.446751992272398, + "grad_norm": 0.3224380612373352, + "learning_rate": 4.805781161868182e-05, + "loss": 1.6681, + "step": 1850 + }, + { + "epoch": 0.44699347983578847, + "grad_norm": 0.3073568344116211, + "learning_rate": 4.802747712939518e-05, + "loss": 1.6864, + "step": 1851 + }, + { + "epoch": 0.44723496739917895, + "grad_norm": 0.34604325890541077, + "learning_rate": 4.799713782889409e-05, + "loss": 1.969, + "step": 1852 + }, + { + "epoch": 0.44747645496256944, + "grad_norm": 0.3278951346874237, + "learning_rate": 4.796679373536222e-05, + "loss": 1.6306, + "step": 1853 + }, + { + "epoch": 0.4477179425259599, + "grad_norm": 0.3199866712093353, + "learning_rate": 4.7936444866986066e-05, + "loss": 1.6913, + "step": 1854 + }, + { + "epoch": 0.4479594300893504, + "grad_norm": 0.32705411314964294, + "learning_rate": 4.790609124195506e-05, + "loss": 1.8419, + "step": 1855 + }, + { + "epoch": 0.4482009176527409, + "grad_norm": 0.3279324471950531, + "learning_rate": 4.78757328784614e-05, + "loss": 1.711, + "step": 1856 + }, + { + "epoch": 0.4484424052161314, + "grad_norm": 0.3183402419090271, + "learning_rate": 4.7845369794700185e-05, + "loss": 1.7563, + "step": 1857 + }, + { + "epoch": 0.44868389277952186, + "grad_norm": 0.3299994170665741, + "learning_rate": 4.781500200886934e-05, + "loss": 1.7747, + "step": 1858 + }, + { + "epoch": 0.44892538034291235, + "grad_norm": 0.33904218673706055, + "learning_rate": 4.7784629539169555e-05, + "loss": 1.7146, + "step": 1859 + }, + { + "epoch": 0.44916686790630284, + "grad_norm": 0.31083980202674866, + "learning_rate": 4.7754252403804404e-05, + "loss": 1.6899, + "step": 1860 + }, + { + "epoch": 0.4494083554696933, + "grad_norm": 0.320126473903656, + "learning_rate": 4.7723870620980206e-05, + "loss": 1.65, + "step": 1861 + }, + { + "epoch": 0.4496498430330838, + "grad_norm": 0.322860985994339, + "learning_rate": 
4.769348420890607e-05, + "loss": 1.7541, + "step": 1862 + }, + { + "epoch": 0.4498913305964743, + "grad_norm": 0.3172602653503418, + "learning_rate": 4.766309318579391e-05, + "loss": 1.6166, + "step": 1863 + }, + { + "epoch": 0.4501328181598648, + "grad_norm": 0.33034148812294006, + "learning_rate": 4.7632697569858336e-05, + "loss": 1.9764, + "step": 1864 + }, + { + "epoch": 0.45037430572325526, + "grad_norm": 0.3384269177913666, + "learning_rate": 4.760229737931681e-05, + "loss": 1.7827, + "step": 1865 + }, + { + "epoch": 0.45061579328664575, + "grad_norm": 0.3293705880641937, + "learning_rate": 4.7571892632389454e-05, + "loss": 1.7764, + "step": 1866 + }, + { + "epoch": 0.45085728085003623, + "grad_norm": 0.32411205768585205, + "learning_rate": 4.7541483347299154e-05, + "loss": 1.7321, + "step": 1867 + }, + { + "epoch": 0.4510987684134267, + "grad_norm": 0.3312840163707733, + "learning_rate": 4.7511069542271504e-05, + "loss": 1.8471, + "step": 1868 + }, + { + "epoch": 0.4513402559768172, + "grad_norm": 0.33269646763801575, + "learning_rate": 4.748065123553481e-05, + "loss": 1.7057, + "step": 1869 + }, + { + "epoch": 0.4515817435402077, + "grad_norm": 0.32271480560302734, + "learning_rate": 4.74502284453201e-05, + "loss": 1.7683, + "step": 1870 + }, + { + "epoch": 0.4518232311035982, + "grad_norm": 0.32621634006500244, + "learning_rate": 4.7419801189861065e-05, + "loss": 1.9058, + "step": 1871 + }, + { + "epoch": 0.45206471866698866, + "grad_norm": 0.31796547770500183, + "learning_rate": 4.7389369487394046e-05, + "loss": 1.5809, + "step": 1872 + }, + { + "epoch": 0.45230620623037915, + "grad_norm": 0.31985053420066833, + "learning_rate": 4.735893335615812e-05, + "loss": 1.7732, + "step": 1873 + }, + { + "epoch": 0.45254769379376963, + "grad_norm": 0.3129877746105194, + "learning_rate": 4.732849281439495e-05, + "loss": 1.7053, + "step": 1874 + }, + { + "epoch": 0.4527891813571601, + "grad_norm": 0.3248676359653473, + "learning_rate": 4.729804788034887e-05, + "loss": 1.9495, + "step": 1875 + }, + { + "epoch": 0.4530306689205506, + "grad_norm": 0.32636207342147827, + "learning_rate": 4.726759857226688e-05, + "loss": 1.906, + "step": 1876 + }, + { + "epoch": 0.4532721564839411, + "grad_norm": 0.31957873702049255, + "learning_rate": 4.723714490839853e-05, + "loss": 1.7117, + "step": 1877 + }, + { + "epoch": 0.4535136440473316, + "grad_norm": 0.33045974373817444, + "learning_rate": 4.720668690699603e-05, + "loss": 1.7147, + "step": 1878 + }, + { + "epoch": 0.45375513161072206, + "grad_norm": 0.3191014230251312, + "learning_rate": 4.717622458631418e-05, + "loss": 1.6461, + "step": 1879 + }, + { + "epoch": 0.45399661917411255, + "grad_norm": 0.33815374970436096, + "learning_rate": 4.714575796461038e-05, + "loss": 1.7384, + "step": 1880 + }, + { + "epoch": 0.45423810673750303, + "grad_norm": 0.3154662847518921, + "learning_rate": 4.711528706014457e-05, + "loss": 1.8105, + "step": 1881 + }, + { + "epoch": 0.4544795943008935, + "grad_norm": 0.3145321011543274, + "learning_rate": 4.70848118911793e-05, + "loss": 1.6214, + "step": 1882 + }, + { + "epoch": 0.454721081864284, + "grad_norm": 0.3181321322917938, + "learning_rate": 4.705433247597965e-05, + "loss": 1.6951, + "step": 1883 + }, + { + "epoch": 0.4549625694276745, + "grad_norm": 0.338344007730484, + "learning_rate": 4.702384883281325e-05, + "loss": 1.8272, + "step": 1884 + }, + { + "epoch": 0.455204056991065, + "grad_norm": 0.33954915404319763, + "learning_rate": 4.699336097995027e-05, + "loss": 1.9373, + "step": 1885 + }, + { + "epoch": 
0.45544554455445546, + "grad_norm": 0.3360753655433655, + "learning_rate": 4.696286893566341e-05, + "loss": 1.5562, + "step": 1886 + }, + { + "epoch": 0.45568703211784595, + "grad_norm": 0.3098269999027252, + "learning_rate": 4.693237271822786e-05, + "loss": 1.6535, + "step": 1887 + }, + { + "epoch": 0.45592851968123643, + "grad_norm": 0.3185242712497711, + "learning_rate": 4.6901872345921326e-05, + "loss": 1.7053, + "step": 1888 + }, + { + "epoch": 0.4561700072446269, + "grad_norm": 0.3227466642856598, + "learning_rate": 4.6871367837024e-05, + "loss": 1.8213, + "step": 1889 + }, + { + "epoch": 0.4564114948080174, + "grad_norm": 0.32636722922325134, + "learning_rate": 4.6840859209818554e-05, + "loss": 1.8187, + "step": 1890 + }, + { + "epoch": 0.4566529823714079, + "grad_norm": 0.318192720413208, + "learning_rate": 4.681034648259014e-05, + "loss": 1.7479, + "step": 1891 + }, + { + "epoch": 0.4568944699347984, + "grad_norm": 0.30496731400489807, + "learning_rate": 4.677982967362633e-05, + "loss": 1.7133, + "step": 1892 + }, + { + "epoch": 0.45713595749818886, + "grad_norm": 0.33690890669822693, + "learning_rate": 4.674930880121719e-05, + "loss": 1.7466, + "step": 1893 + }, + { + "epoch": 0.45737744506157935, + "grad_norm": 0.31268423795700073, + "learning_rate": 4.67187838836552e-05, + "loss": 1.8265, + "step": 1894 + }, + { + "epoch": 0.45761893262496983, + "grad_norm": 0.33327123522758484, + "learning_rate": 4.668825493923525e-05, + "loss": 1.9799, + "step": 1895 + }, + { + "epoch": 0.4578604201883603, + "grad_norm": 0.30676886439323425, + "learning_rate": 4.6657721986254674e-05, + "loss": 1.6389, + "step": 1896 + }, + { + "epoch": 0.4581019077517508, + "grad_norm": 0.3276241719722748, + "learning_rate": 4.6627185043013165e-05, + "loss": 1.7445, + "step": 1897 + }, + { + "epoch": 0.4583433953151413, + "grad_norm": 0.30695146322250366, + "learning_rate": 4.659664412781286e-05, + "loss": 1.6091, + "step": 1898 + }, + { + "epoch": 0.4585848828785318, + "grad_norm": 0.31264829635620117, + "learning_rate": 4.656609925895826e-05, + "loss": 1.7049, + "step": 1899 + }, + { + "epoch": 0.45882637044192226, + "grad_norm": 0.32918858528137207, + "learning_rate": 4.65355504547562e-05, + "loss": 1.795, + "step": 1900 + }, + { + "epoch": 0.45906785800531275, + "grad_norm": 0.31754815578460693, + "learning_rate": 4.6504997733515904e-05, + "loss": 1.7422, + "step": 1901 + }, + { + "epoch": 0.45930934556870323, + "grad_norm": 0.33143150806427, + "learning_rate": 4.6474441113548957e-05, + "loss": 1.8414, + "step": 1902 + }, + { + "epoch": 0.4595508331320937, + "grad_norm": 0.31607118248939514, + "learning_rate": 4.6443880613169254e-05, + "loss": 1.6834, + "step": 1903 + }, + { + "epoch": 0.4597923206954842, + "grad_norm": 0.3158678114414215, + "learning_rate": 4.641331625069302e-05, + "loss": 1.6669, + "step": 1904 + }, + { + "epoch": 0.4600338082588747, + "grad_norm": 0.3216167688369751, + "learning_rate": 4.6382748044438815e-05, + "loss": 1.7106, + "step": 1905 + }, + { + "epoch": 0.4602752958222652, + "grad_norm": 0.38955986499786377, + "learning_rate": 4.6352176012727484e-05, + "loss": 1.788, + "step": 1906 + }, + { + "epoch": 0.46051678338565566, + "grad_norm": 0.3093554675579071, + "learning_rate": 4.632160017388215e-05, + "loss": 1.663, + "step": 1907 + }, + { + "epoch": 0.46075827094904614, + "grad_norm": 0.32816994190216064, + "learning_rate": 4.629102054622825e-05, + "loss": 1.7601, + "step": 1908 + }, + { + "epoch": 0.46099975851243663, + "grad_norm": 0.3421451449394226, + "learning_rate": 
4.626043714809348e-05, + "loss": 1.7477, + "step": 1909 + }, + { + "epoch": 0.4612412460758271, + "grad_norm": 0.3192618191242218, + "learning_rate": 4.622984999780779e-05, + "loss": 1.6711, + "step": 1910 + }, + { + "epoch": 0.4614827336392176, + "grad_norm": 0.3109111785888672, + "learning_rate": 4.61992591137034e-05, + "loss": 1.6517, + "step": 1911 + }, + { + "epoch": 0.4617242212026081, + "grad_norm": 0.3304436504840851, + "learning_rate": 4.6168664514114723e-05, + "loss": 1.7932, + "step": 1912 + }, + { + "epoch": 0.4619657087659986, + "grad_norm": 0.3186758756637573, + "learning_rate": 4.613806621737844e-05, + "loss": 1.7554, + "step": 1913 + }, + { + "epoch": 0.46220719632938906, + "grad_norm": 0.31981173157691956, + "learning_rate": 4.6107464241833436e-05, + "loss": 1.7032, + "step": 1914 + }, + { + "epoch": 0.46244868389277954, + "grad_norm": 0.3194178342819214, + "learning_rate": 4.6076858605820804e-05, + "loss": 1.6827, + "step": 1915 + }, + { + "epoch": 0.46269017145617003, + "grad_norm": 0.34643322229385376, + "learning_rate": 4.604624932768382e-05, + "loss": 2.0343, + "step": 1916 + }, + { + "epoch": 0.4629316590195605, + "grad_norm": 0.32240161299705505, + "learning_rate": 4.6015636425767933e-05, + "loss": 1.7716, + "step": 1917 + }, + { + "epoch": 0.463173146582951, + "grad_norm": 0.3118249773979187, + "learning_rate": 4.59850199184208e-05, + "loss": 1.7507, + "step": 1918 + }, + { + "epoch": 0.4634146341463415, + "grad_norm": 0.32204747200012207, + "learning_rate": 4.595439982399222e-05, + "loss": 1.6723, + "step": 1919 + }, + { + "epoch": 0.46365612170973197, + "grad_norm": 0.3252248764038086, + "learning_rate": 4.592377616083413e-05, + "loss": 1.8006, + "step": 1920 + }, + { + "epoch": 0.46389760927312246, + "grad_norm": 0.3427707552909851, + "learning_rate": 4.5893148947300636e-05, + "loss": 1.8713, + "step": 1921 + }, + { + "epoch": 0.46413909683651294, + "grad_norm": 0.3312002122402191, + "learning_rate": 4.5862518201747926e-05, + "loss": 1.7791, + "step": 1922 + }, + { + "epoch": 0.46438058439990343, + "grad_norm": 0.3222915828227997, + "learning_rate": 4.5831883942534344e-05, + "loss": 1.7691, + "step": 1923 + }, + { + "epoch": 0.4646220719632939, + "grad_norm": 0.3127139210700989, + "learning_rate": 4.580124618802034e-05, + "loss": 1.7361, + "step": 1924 + }, + { + "epoch": 0.4648635595266844, + "grad_norm": 0.30985063314437866, + "learning_rate": 4.577060495656842e-05, + "loss": 1.706, + "step": 1925 + }, + { + "epoch": 0.4651050470900749, + "grad_norm": 0.3158462643623352, + "learning_rate": 4.573996026654321e-05, + "loss": 1.7321, + "step": 1926 + }, + { + "epoch": 0.46534653465346537, + "grad_norm": 0.3284815847873688, + "learning_rate": 4.570931213631141e-05, + "loss": 1.6042, + "step": 1927 + }, + { + "epoch": 0.46558802221685586, + "grad_norm": 0.3247036039829254, + "learning_rate": 4.567866058424176e-05, + "loss": 1.6458, + "step": 1928 + }, + { + "epoch": 0.46582950978024634, + "grad_norm": 0.31772297620773315, + "learning_rate": 4.564800562870506e-05, + "loss": 1.7685, + "step": 1929 + }, + { + "epoch": 0.46607099734363683, + "grad_norm": 0.3419104218482971, + "learning_rate": 4.561734728807417e-05, + "loss": 1.9509, + "step": 1930 + }, + { + "epoch": 0.4663124849070273, + "grad_norm": 0.3184857964515686, + "learning_rate": 4.558668558072393e-05, + "loss": 1.6747, + "step": 1931 + }, + { + "epoch": 0.4665539724704178, + "grad_norm": 0.3354939818382263, + "learning_rate": 4.555602052503126e-05, + "loss": 1.8638, + "step": 1932 + }, + { + "epoch": 
0.4667954600338083, + "grad_norm": 0.3130846619606018, + "learning_rate": 4.5525352139375035e-05, + "loss": 1.716, + "step": 1933 + }, + { + "epoch": 0.46703694759719877, + "grad_norm": 0.3140762150287628, + "learning_rate": 4.5494680442136144e-05, + "loss": 1.7392, + "step": 1934 + }, + { + "epoch": 0.46727843516058926, + "grad_norm": 0.32126384973526, + "learning_rate": 4.546400545169748e-05, + "loss": 1.879, + "step": 1935 + }, + { + "epoch": 0.46751992272397974, + "grad_norm": 0.31407633423805237, + "learning_rate": 4.543332718644388e-05, + "loss": 1.631, + "step": 1936 + }, + { + "epoch": 0.4677614102873702, + "grad_norm": 0.3271917402744293, + "learning_rate": 4.5402645664762144e-05, + "loss": 1.7332, + "step": 1937 + }, + { + "epoch": 0.4680028978507607, + "grad_norm": 0.3262588381767273, + "learning_rate": 4.5371960905041066e-05, + "loss": 1.7904, + "step": 1938 + }, + { + "epoch": 0.4682443854141512, + "grad_norm": 0.3321874439716339, + "learning_rate": 4.534127292567133e-05, + "loss": 1.8836, + "step": 1939 + }, + { + "epoch": 0.4684858729775417, + "grad_norm": 0.32539454102516174, + "learning_rate": 4.531058174504557e-05, + "loss": 1.8183, + "step": 1940 + }, + { + "epoch": 0.46872736054093217, + "grad_norm": 0.31996139883995056, + "learning_rate": 4.5279887381558335e-05, + "loss": 1.8423, + "step": 1941 + }, + { + "epoch": 0.4689688481043226, + "grad_norm": 0.2960781753063202, + "learning_rate": 4.524918985360611e-05, + "loss": 1.5413, + "step": 1942 + }, + { + "epoch": 0.4692103356677131, + "grad_norm": 0.33326393365859985, + "learning_rate": 4.521848917958721e-05, + "loss": 1.7277, + "step": 1943 + }, + { + "epoch": 0.46945182323110357, + "grad_norm": 0.30825114250183105, + "learning_rate": 4.518778537790193e-05, + "loss": 1.5946, + "step": 1944 + }, + { + "epoch": 0.46969331079449406, + "grad_norm": 0.3104898929595947, + "learning_rate": 4.515707846695235e-05, + "loss": 1.5605, + "step": 1945 + }, + { + "epoch": 0.46993479835788454, + "grad_norm": 0.3065233826637268, + "learning_rate": 4.512636846514245e-05, + "loss": 1.6081, + "step": 1946 + }, + { + "epoch": 0.47017628592127503, + "grad_norm": 0.32400989532470703, + "learning_rate": 4.509565539087809e-05, + "loss": 1.7397, + "step": 1947 + }, + { + "epoch": 0.4704177734846655, + "grad_norm": 0.31074362993240356, + "learning_rate": 4.506493926256692e-05, + "loss": 1.7263, + "step": 1948 + }, + { + "epoch": 0.470659261048056, + "grad_norm": 0.3119424283504486, + "learning_rate": 4.5034220098618445e-05, + "loss": 1.6285, + "step": 1949 + }, + { + "epoch": 0.4709007486114465, + "grad_norm": 0.3202967345714569, + "learning_rate": 4.500349791744401e-05, + "loss": 1.6423, + "step": 1950 + }, + { + "epoch": 0.47114223617483697, + "grad_norm": 0.3224698603153229, + "learning_rate": 4.4972772737456734e-05, + "loss": 1.8148, + "step": 1951 + }, + { + "epoch": 0.47138372373822746, + "grad_norm": 0.3153221607208252, + "learning_rate": 4.494204457707153e-05, + "loss": 1.6917, + "step": 1952 + }, + { + "epoch": 0.47162521130161794, + "grad_norm": 0.32202938199043274, + "learning_rate": 4.4911313454705155e-05, + "loss": 1.8316, + "step": 1953 + }, + { + "epoch": 0.4718666988650084, + "grad_norm": 0.330608606338501, + "learning_rate": 4.488057938877607e-05, + "loss": 1.7924, + "step": 1954 + }, + { + "epoch": 0.4721081864283989, + "grad_norm": 0.32101622223854065, + "learning_rate": 4.484984239770454e-05, + "loss": 1.7442, + "step": 1955 + }, + { + "epoch": 0.4723496739917894, + "grad_norm": 0.3142457604408264, + "learning_rate": 
4.4819102499912575e-05, + "loss": 1.6354, + "step": 1956 + }, + { + "epoch": 0.4725911615551799, + "grad_norm": 0.3051566183567047, + "learning_rate": 4.478835971382392e-05, + "loss": 1.6723, + "step": 1957 + }, + { + "epoch": 0.47283264911857037, + "grad_norm": 0.31328076124191284, + "learning_rate": 4.475761405786407e-05, + "loss": 1.6896, + "step": 1958 + }, + { + "epoch": 0.47307413668196086, + "grad_norm": 0.3216973841190338, + "learning_rate": 4.4726865550460215e-05, + "loss": 1.7345, + "step": 1959 + }, + { + "epoch": 0.47331562424535134, + "grad_norm": 0.3146194517612457, + "learning_rate": 4.469611421004126e-05, + "loss": 1.6428, + "step": 1960 + }, + { + "epoch": 0.4735571118087418, + "grad_norm": 0.33474940061569214, + "learning_rate": 4.4665360055037834e-05, + "loss": 1.7699, + "step": 1961 + }, + { + "epoch": 0.4737985993721323, + "grad_norm": 0.30783769488334656, + "learning_rate": 4.463460310388222e-05, + "loss": 1.6049, + "step": 1962 + }, + { + "epoch": 0.4740400869355228, + "grad_norm": 0.3315912187099457, + "learning_rate": 4.4603843375008387e-05, + "loss": 1.7062, + "step": 1963 + }, + { + "epoch": 0.4742815744989133, + "grad_norm": 0.33379220962524414, + "learning_rate": 4.457308088685197e-05, + "loss": 1.8349, + "step": 1964 + }, + { + "epoch": 0.47452306206230377, + "grad_norm": 0.29385891556739807, + "learning_rate": 4.454231565785029e-05, + "loss": 1.5972, + "step": 1965 + }, + { + "epoch": 0.47476454962569425, + "grad_norm": 0.33387261629104614, + "learning_rate": 4.451154770644224e-05, + "loss": 1.8021, + "step": 1966 + }, + { + "epoch": 0.47500603718908474, + "grad_norm": 0.346824049949646, + "learning_rate": 4.4480777051068416e-05, + "loss": 1.7912, + "step": 1967 + }, + { + "epoch": 0.4752475247524752, + "grad_norm": 0.3210572302341461, + "learning_rate": 4.445000371017099e-05, + "loss": 1.7741, + "step": 1968 + }, + { + "epoch": 0.4754890123158657, + "grad_norm": 0.3143101632595062, + "learning_rate": 4.441922770219374e-05, + "loss": 1.5724, + "step": 1969 + }, + { + "epoch": 0.4757304998792562, + "grad_norm": 0.3186543881893158, + "learning_rate": 4.4388449045582086e-05, + "loss": 1.6874, + "step": 1970 + }, + { + "epoch": 0.4759719874426467, + "grad_norm": 0.3205025792121887, + "learning_rate": 4.4357667758783e-05, + "loss": 1.5621, + "step": 1971 + }, + { + "epoch": 0.47621347500603717, + "grad_norm": 0.3176744282245636, + "learning_rate": 4.432688386024503e-05, + "loss": 1.8236, + "step": 1972 + }, + { + "epoch": 0.47645496256942765, + "grad_norm": 0.33443495631217957, + "learning_rate": 4.429609736841832e-05, + "loss": 1.9467, + "step": 1973 + }, + { + "epoch": 0.47669645013281814, + "grad_norm": 0.3172236680984497, + "learning_rate": 4.426530830175452e-05, + "loss": 1.776, + "step": 1974 + }, + { + "epoch": 0.4769379376962086, + "grad_norm": 0.3080536425113678, + "learning_rate": 4.423451667870686e-05, + "loss": 1.6937, + "step": 1975 + }, + { + "epoch": 0.4771794252595991, + "grad_norm": 0.31537625193595886, + "learning_rate": 4.4203722517730104e-05, + "loss": 1.6426, + "step": 1976 + }, + { + "epoch": 0.4774209128229896, + "grad_norm": 0.30593976378440857, + "learning_rate": 4.417292583728053e-05, + "loss": 1.663, + "step": 1977 + }, + { + "epoch": 0.4776624003863801, + "grad_norm": 0.3199318051338196, + "learning_rate": 4.4142126655815886e-05, + "loss": 1.7582, + "step": 1978 + }, + { + "epoch": 0.47790388794977057, + "grad_norm": 0.3328000009059906, + "learning_rate": 4.411132499179549e-05, + "loss": 1.7726, + "step": 1979 + }, + { + "epoch": 
0.47814537551316105, + "grad_norm": 0.31644874811172485, + "learning_rate": 4.4080520863680106e-05, + "loss": 1.7679, + "step": 1980 + }, + { + "epoch": 0.47838686307655154, + "grad_norm": 0.3406371474266052, + "learning_rate": 4.4049714289931956e-05, + "loss": 1.9363, + "step": 1981 + }, + { + "epoch": 0.478628350639942, + "grad_norm": 0.3192148804664612, + "learning_rate": 4.401890528901479e-05, + "loss": 1.7492, + "step": 1982 + }, + { + "epoch": 0.4788698382033325, + "grad_norm": 0.3432200849056244, + "learning_rate": 4.3988093879393754e-05, + "loss": 1.7355, + "step": 1983 + }, + { + "epoch": 0.479111325766723, + "grad_norm": 0.30041298270225525, + "learning_rate": 4.395728007953545e-05, + "loss": 1.7963, + "step": 1984 + }, + { + "epoch": 0.4793528133301135, + "grad_norm": 0.2945508360862732, + "learning_rate": 4.392646390790794e-05, + "loss": 1.5881, + "step": 1985 + }, + { + "epoch": 0.47959430089350397, + "grad_norm": 0.3067844808101654, + "learning_rate": 4.389564538298068e-05, + "loss": 1.677, + "step": 1986 + }, + { + "epoch": 0.47983578845689445, + "grad_norm": 0.29964399337768555, + "learning_rate": 4.386482452322456e-05, + "loss": 1.4658, + "step": 1987 + }, + { + "epoch": 0.48007727602028494, + "grad_norm": 0.3236359655857086, + "learning_rate": 4.383400134711183e-05, + "loss": 1.652, + "step": 1988 + }, + { + "epoch": 0.4803187635836754, + "grad_norm": 0.30299097299575806, + "learning_rate": 4.380317587311618e-05, + "loss": 1.6701, + "step": 1989 + }, + { + "epoch": 0.4805602511470659, + "grad_norm": 0.3327222466468811, + "learning_rate": 4.377234811971263e-05, + "loss": 1.6186, + "step": 1990 + }, + { + "epoch": 0.4808017387104564, + "grad_norm": 0.3213178217411041, + "learning_rate": 4.374151810537759e-05, + "loss": 1.6802, + "step": 1991 + }, + { + "epoch": 0.4810432262738469, + "grad_norm": 0.3151525855064392, + "learning_rate": 4.3710685848588846e-05, + "loss": 1.7172, + "step": 1992 + }, + { + "epoch": 0.48128471383723737, + "grad_norm": 0.31488415598869324, + "learning_rate": 4.367985136782547e-05, + "loss": 1.6706, + "step": 1993 + }, + { + "epoch": 0.48152620140062785, + "grad_norm": 0.34251371026039124, + "learning_rate": 4.3649014681567914e-05, + "loss": 1.9582, + "step": 1994 + }, + { + "epoch": 0.48176768896401834, + "grad_norm": 0.3280927240848541, + "learning_rate": 4.361817580829795e-05, + "loss": 1.7852, + "step": 1995 + }, + { + "epoch": 0.4820091765274088, + "grad_norm": 0.32400888204574585, + "learning_rate": 4.358733476649863e-05, + "loss": 1.6627, + "step": 1996 + }, + { + "epoch": 0.4822506640907993, + "grad_norm": 0.3338795304298401, + "learning_rate": 4.3556491574654335e-05, + "loss": 1.7898, + "step": 1997 + }, + { + "epoch": 0.4824921516541898, + "grad_norm": 0.3094484508037567, + "learning_rate": 4.352564625125073e-05, + "loss": 1.804, + "step": 1998 + }, + { + "epoch": 0.4827336392175803, + "grad_norm": 0.312665730714798, + "learning_rate": 4.349479881477473e-05, + "loss": 1.6702, + "step": 1999 + }, + { + "epoch": 0.48297512678097076, + "grad_norm": 0.3298127055168152, + "learning_rate": 4.3463949283714577e-05, + "loss": 1.7842, + "step": 2000 + } + ], + "logging_steps": 1.0, + "max_steps": 4141, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.017027157491712e+18, + 
"train_batch_size": 2, + "trial_name": null, + "trial_params": null +}