diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,27774 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9994323557237466, + "eval_steps": 500, + "global_step": 3963, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0007568590350047304, + "grad_norm": 123.05032348632812, + "learning_rate": 0.0, + "loss": 2.0667, + "step": 1 + }, + { + "epoch": 0.0015137180700094607, + "grad_norm": 59.42024612426758, + "learning_rate": 3.5436764027111585e-06, + "loss": 1.9891, + "step": 2 + }, + { + "epoch": 0.002270577105014191, + "grad_norm": 74.41653442382812, + "learning_rate": 5.61659421298763e-06, + "loss": 1.9745, + "step": 3 + }, + { + "epoch": 0.0030274361400189215, + "grad_norm": 65.0828857421875, + "learning_rate": 7.087352805422317e-06, + "loss": 1.9442, + "step": 4 + }, + { + "epoch": 0.003784295175023652, + "grad_norm": 77.46288299560547, + "learning_rate": 8.228161798644422e-06, + "loss": 1.911, + "step": 5 + }, + { + "epoch": 0.004541154210028382, + "grad_norm": 140.7876434326172, + "learning_rate": 9.160270615698787e-06, + "loss": 1.9, + "step": 6 + }, + { + "epoch": 0.005298013245033113, + "grad_norm": 39.34813690185547, + "learning_rate": 9.948357391330555e-06, + "loss": 1.8421, + "step": 7 + }, + { + "epoch": 0.006054872280037843, + "grad_norm": 42.30311584472656, + "learning_rate": 1.0631029208133474e-05, + "loss": 1.8634, + "step": 8 + }, + { + "epoch": 0.006811731315042573, + "grad_norm": 56.799530029296875, + "learning_rate": 1.123318842597526e-05, + "loss": 1.8391, + "step": 9 + }, + { + "epoch": 0.007568590350047304, + "grad_norm": 74.55519104003906, + "learning_rate": 1.1771838201355582e-05, + "loss": 1.7531, + "step": 10 + }, + { + "epoch": 0.008325449385052034, + "grad_norm": 31.099952697753906, + "learning_rate": 1.2259106193757859e-05, + "loss": 1.7675, + "step": 11 + }, + { + "epoch": 0.009082308420056764, + "grad_norm": 49.344966888427734, + "learning_rate": 1.2703947018409945e-05, + "loss": 1.7896, + "step": 12 + }, + { + "epoch": 0.009839167455061495, + "grad_norm": 48.00835418701172, + "learning_rate": 1.311316090883172e-05, + "loss": 1.8585, + "step": 13 + }, + { + "epoch": 0.010596026490066225, + "grad_norm": 38.080078125, + "learning_rate": 1.3492033794041713e-05, + "loss": 1.7329, + "step": 14 + }, + { + "epoch": 0.011352885525070956, + "grad_norm": 45.796382904052734, + "learning_rate": 1.384475601163205e-05, + "loss": 1.8033, + "step": 15 + }, + { + "epoch": 0.012109744560075686, + "grad_norm": 35.87776184082031, + "learning_rate": 1.4174705610844634e-05, + "loss": 1.7784, + "step": 16 + }, + { + "epoch": 0.012866603595080416, + "grad_norm": 25.678325653076172, + "learning_rate": 1.4484645617497535e-05, + "loss": 1.7741, + "step": 17 + }, + { + "epoch": 0.013623462630085147, + "grad_norm": 28.66301918029785, + "learning_rate": 1.4776864828686414e-05, + "loss": 1.7957, + "step": 18 + }, + { + "epoch": 0.014380321665089877, + "grad_norm": 37.723976135253906, + "learning_rate": 1.505328048981752e-05, + "loss": 1.695, + "step": 19 + }, + { + "epoch": 0.015137180700094607, + "grad_norm": 36.247718811035156, + "learning_rate": 1.5315514604066738e-05, + "loss": 1.7189, + "step": 20 + }, + { + "epoch": 0.015894039735099338, + "grad_norm": 23.032033920288086, + "learning_rate": 1.5564951604318184e-05, + "loss": 1.6817, + "step": 21 + }, + { + "epoch": 0.016650898770104068, + "grad_norm": 28.1435489654541, + "learning_rate": 1.580278259646902e-05, + "loss": 1.7185, + "step": 22 + }, + { + "epoch": 0.0174077578051088, + "grad_norm": 75.195068359375, + "learning_rate": 1.603003975988117e-05, + "loss": 1.7109, + "step": 23 + }, + { + "epoch": 0.01816461684011353, + "grad_norm": 30.104032516479492, + "learning_rate": 1.6247623421121105e-05, + "loss": 1.6333, + "step": 24 + }, + { + "epoch": 0.01892147587511826, + "grad_norm": 24.25992774963379, + "learning_rate": 1.6456323597288844e-05, + "loss": 1.6416, + "step": 25 + }, + { + "epoch": 0.01967833491012299, + "grad_norm": 28.712949752807617, + "learning_rate": 1.6656837311542876e-05, + "loss": 1.6712, + "step": 26 + }, + { + "epoch": 0.02043519394512772, + "grad_norm": 26.72446060180664, + "learning_rate": 1.6849782638962885e-05, + "loss": 1.5939, + "step": 27 + }, + { + "epoch": 0.02119205298013245, + "grad_norm": 20.644784927368164, + "learning_rate": 1.7035710196752873e-05, + "loss": 1.6718, + "step": 28 + }, + { + "epoch": 0.02194891201513718, + "grad_norm": 24.643821716308594, + "learning_rate": 1.7215112617252848e-05, + "loss": 1.6778, + "step": 29 + }, + { + "epoch": 0.02270577105014191, + "grad_norm": 22.256315231323242, + "learning_rate": 1.738843241434321e-05, + "loss": 1.6233, + "step": 30 + }, + { + "epoch": 0.02346263008514664, + "grad_norm": 24.24241065979004, + "learning_rate": 1.7556068559516658e-05, + "loss": 1.6744, + "step": 31 + }, + { + "epoch": 0.024219489120151372, + "grad_norm": 37.310150146484375, + "learning_rate": 1.7718382013555794e-05, + "loss": 1.6556, + "step": 32 + }, + { + "epoch": 0.024976348155156102, + "grad_norm": 48.23684310913086, + "learning_rate": 1.7875700406745488e-05, + "loss": 1.6575, + "step": 33 + }, + { + "epoch": 0.025733207190160833, + "grad_norm": 24.166748046875, + "learning_rate": 1.8028322020208693e-05, + "loss": 1.6946, + "step": 34 + }, + { + "epoch": 0.026490066225165563, + "grad_norm": 29.902538299560547, + "learning_rate": 1.817651918997498e-05, + "loss": 1.6453, + "step": 35 + }, + { + "epoch": 0.027246925260170293, + "grad_norm": 27.081722259521484, + "learning_rate": 1.8320541231397574e-05, + "loss": 1.5933, + "step": 36 + }, + { + "epoch": 0.028003784295175024, + "grad_norm": 28.04783058166504, + "learning_rate": 1.8460616962803535e-05, + "loss": 1.676, + "step": 37 + }, + { + "epoch": 0.028760643330179754, + "grad_norm": 49.34148406982422, + "learning_rate": 1.859695689252868e-05, + "loss": 1.7431, + "step": 38 + }, + { + "epoch": 0.029517502365184484, + "grad_norm": 32.92803192138672, + "learning_rate": 1.872975512181935e-05, + "loss": 1.7004, + "step": 39 + }, + { + "epoch": 0.030274361400189215, + "grad_norm": 28.5103816986084, + "learning_rate": 1.8859191006777896e-05, + "loss": 1.646, + "step": 40 + }, + { + "epoch": 0.031031220435193945, + "grad_norm": 16.540956497192383, + "learning_rate": 1.8985430615062968e-05, + "loss": 1.6601, + "step": 41 + }, + { + "epoch": 0.031788079470198675, + "grad_norm": 26.588886260986328, + "learning_rate": 1.9108628007029345e-05, + "loss": 1.6273, + "step": 42 + }, + { + "epoch": 0.03254493850520341, + "grad_norm": 42.97163009643555, + "learning_rate": 1.9228926366101076e-05, + "loss": 1.5573, + "step": 43 + }, + { + "epoch": 0.033301797540208136, + "grad_norm": 40.343658447265625, + "learning_rate": 1.9346458999180177e-05, + "loss": 1.5821, + "step": 44 + }, + { + "epoch": 0.03405865657521287, + "grad_norm": 41.81525421142578, + "learning_rate": 1.946135022461968e-05, + "loss": 1.5927, + "step": 45 + }, + { + "epoch": 0.0348155156102176, + "grad_norm": 24.463436126708984, + "learning_rate": 1.9573716162592327e-05, + "loss": 1.6377, + "step": 46 + }, + { + "epoch": 0.03557237464522233, + "grad_norm": 21.16547203063965, + "learning_rate": 1.9683665440452515e-05, + "loss": 1.6151, + "step": 47 + }, + { + "epoch": 0.03632923368022706, + "grad_norm": 75.09718322753906, + "learning_rate": 1.9791299823832263e-05, + "loss": 1.6261, + "step": 48 + }, + { + "epoch": 0.03708609271523179, + "grad_norm": 30.04339027404785, + "learning_rate": 1.989671478266111e-05, + "loss": 1.6229, + "step": 49 + }, + { + "epoch": 0.03784295175023652, + "grad_norm": 100.00825500488281, + "learning_rate": 2e-05, + "loss": 1.6116, + "step": 50 + }, + { + "epoch": 0.03859981078524125, + "grad_norm": 28.68238639831543, + "learning_rate": 1.9999996777398546e-05, + "loss": 1.6644, + "step": 51 + }, + { + "epoch": 0.03935666982024598, + "grad_norm": 68.90675354003906, + "learning_rate": 1.9999987109596254e-05, + "loss": 1.6159, + "step": 52 + }, + { + "epoch": 0.04011352885525071, + "grad_norm": 81.32110595703125, + "learning_rate": 1.999997099659936e-05, + "loss": 1.7117, + "step": 53 + }, + { + "epoch": 0.04087038789025544, + "grad_norm": 171.38938903808594, + "learning_rate": 1.999994843841825e-05, + "loss": 1.5922, + "step": 54 + }, + { + "epoch": 0.041627246925260174, + "grad_norm": 43.32768249511719, + "learning_rate": 1.9999919435067465e-05, + "loss": 1.6245, + "step": 55 + }, + { + "epoch": 0.0423841059602649, + "grad_norm": 74.8743896484375, + "learning_rate": 1.9999883986565696e-05, + "loss": 1.6613, + "step": 56 + }, + { + "epoch": 0.043140964995269634, + "grad_norm": 18.4515438079834, + "learning_rate": 1.9999842092935797e-05, + "loss": 1.6048, + "step": 57 + }, + { + "epoch": 0.04389782403027436, + "grad_norm": 22.151155471801758, + "learning_rate": 1.999979375420477e-05, + "loss": 1.632, + "step": 58 + }, + { + "epoch": 0.044654683065279095, + "grad_norm": 19.156835556030273, + "learning_rate": 1.9999738970403775e-05, + "loss": 1.6189, + "step": 59 + }, + { + "epoch": 0.04541154210028382, + "grad_norm": 18.256269454956055, + "learning_rate": 1.999967774156812e-05, + "loss": 1.6306, + "step": 60 + }, + { + "epoch": 0.046168401135288556, + "grad_norm": 26.946420669555664, + "learning_rate": 1.999961006773728e-05, + "loss": 1.6411, + "step": 61 + }, + { + "epoch": 0.04692526017029328, + "grad_norm": 23.500892639160156, + "learning_rate": 1.999953594895487e-05, + "loss": 1.5876, + "step": 62 + }, + { + "epoch": 0.04768211920529802, + "grad_norm": 12.633379936218262, + "learning_rate": 1.999945538526867e-05, + "loss": 1.601, + "step": 63 + }, + { + "epoch": 0.048438978240302744, + "grad_norm": 15.04751205444336, + "learning_rate": 1.999936837673061e-05, + "loss": 1.6078, + "step": 64 + }, + { + "epoch": 0.04919583727530748, + "grad_norm": 11.406414985656738, + "learning_rate": 1.999927492339677e-05, + "loss": 1.5959, + "step": 65 + }, + { + "epoch": 0.049952696310312204, + "grad_norm": 11.040087699890137, + "learning_rate": 1.9999175025327395e-05, + "loss": 1.6025, + "step": 66 + }, + { + "epoch": 0.05070955534531694, + "grad_norm": 10.689179420471191, + "learning_rate": 1.999906868258687e-05, + "loss": 1.5797, + "step": 67 + }, + { + "epoch": 0.051466414380321665, + "grad_norm": 8.19336986541748, + "learning_rate": 1.9998955895243748e-05, + "loss": 1.5564, + "step": 68 + }, + { + "epoch": 0.0522232734153264, + "grad_norm": 13.38058853149414, + "learning_rate": 1.9998836663370726e-05, + "loss": 1.5584, + "step": 69 + }, + { + "epoch": 0.052980132450331126, + "grad_norm": 7.053563594818115, + "learning_rate": 1.9998710987044664e-05, + "loss": 1.5005, + "step": 70 + }, + { + "epoch": 0.05373699148533586, + "grad_norm": 6.7353105545043945, + "learning_rate": 1.9998578866346564e-05, + "loss": 1.5306, + "step": 71 + }, + { + "epoch": 0.054493850520340587, + "grad_norm": 5.975197792053223, + "learning_rate": 1.9998440301361598e-05, + "loss": 1.5885, + "step": 72 + }, + { + "epoch": 0.05525070955534532, + "grad_norm": 6.6494011878967285, + "learning_rate": 1.9998295292179073e-05, + "loss": 1.532, + "step": 73 + }, + { + "epoch": 0.05600756859035005, + "grad_norm": 5.434142589569092, + "learning_rate": 1.9998143838892468e-05, + "loss": 1.5764, + "step": 74 + }, + { + "epoch": 0.05676442762535478, + "grad_norm": 6.099053859710693, + "learning_rate": 1.99979859415994e-05, + "loss": 1.5187, + "step": 75 + }, + { + "epoch": 0.05752128666035951, + "grad_norm": 5.7470855712890625, + "learning_rate": 1.999782160040166e-05, + "loss": 1.5377, + "step": 76 + }, + { + "epoch": 0.05827814569536424, + "grad_norm": 5.577144145965576, + "learning_rate": 1.9997650815405167e-05, + "loss": 1.4817, + "step": 77 + }, + { + "epoch": 0.05903500473036897, + "grad_norm": 5.294313907623291, + "learning_rate": 1.999747358672001e-05, + "loss": 1.4812, + "step": 78 + }, + { + "epoch": 0.0597918637653737, + "grad_norm": 5.860252380371094, + "learning_rate": 1.9997289914460428e-05, + "loss": 1.524, + "step": 79 + }, + { + "epoch": 0.06054872280037843, + "grad_norm": 9.259795188903809, + "learning_rate": 1.9997099798744815e-05, + "loss": 1.4869, + "step": 80 + }, + { + "epoch": 0.06130558183538316, + "grad_norm": 5.9855852127075195, + "learning_rate": 1.999690323969571e-05, + "loss": 1.5187, + "step": 81 + }, + { + "epoch": 0.06206244087038789, + "grad_norm": 6.138685703277588, + "learning_rate": 1.9996700237439823e-05, + "loss": 1.5468, + "step": 82 + }, + { + "epoch": 0.06281929990539262, + "grad_norm": 6.596303939819336, + "learning_rate": 1.9996490792107997e-05, + "loss": 1.4899, + "step": 83 + }, + { + "epoch": 0.06357615894039735, + "grad_norm": 5.794029712677002, + "learning_rate": 1.9996274903835247e-05, + "loss": 1.4633, + "step": 84 + }, + { + "epoch": 0.06433301797540208, + "grad_norm": 5.815922260284424, + "learning_rate": 1.9996052572760723e-05, + "loss": 1.4968, + "step": 85 + }, + { + "epoch": 0.06508987701040682, + "grad_norm": 11.239161491394043, + "learning_rate": 1.9995823799027737e-05, + "loss": 1.4604, + "step": 86 + }, + { + "epoch": 0.06584673604541154, + "grad_norm": 5.367482662200928, + "learning_rate": 1.9995588582783753e-05, + "loss": 1.518, + "step": 87 + }, + { + "epoch": 0.06660359508041627, + "grad_norm": 19.351980209350586, + "learning_rate": 1.9995346924180394e-05, + "loss": 1.5267, + "step": 88 + }, + { + "epoch": 0.067360454115421, + "grad_norm": 6.7129316329956055, + "learning_rate": 1.999509882337342e-05, + "loss": 1.4639, + "step": 89 + }, + { + "epoch": 0.06811731315042574, + "grad_norm": 6.777989387512207, + "learning_rate": 1.999484428052276e-05, + "loss": 1.5127, + "step": 90 + }, + { + "epoch": 0.06887417218543046, + "grad_norm": 14.996123313903809, + "learning_rate": 1.9994583295792487e-05, + "loss": 1.5277, + "step": 91 + }, + { + "epoch": 0.0696310312204352, + "grad_norm": 6.6374311447143555, + "learning_rate": 1.9994315869350826e-05, + "loss": 1.4834, + "step": 92 + }, + { + "epoch": 0.07038789025543993, + "grad_norm": 11.19003963470459, + "learning_rate": 1.9994042001370154e-05, + "loss": 1.5084, + "step": 93 + }, + { + "epoch": 0.07114474929044466, + "grad_norm": 6.2547407150268555, + "learning_rate": 1.9993761692027007e-05, + "loss": 1.485, + "step": 94 + }, + { + "epoch": 0.07190160832544938, + "grad_norm": 6.645302772521973, + "learning_rate": 1.9993474941502067e-05, + "loss": 1.463, + "step": 95 + }, + { + "epoch": 0.07265846736045412, + "grad_norm": 7.073038578033447, + "learning_rate": 1.9993181749980168e-05, + "loss": 1.509, + "step": 96 + }, + { + "epoch": 0.07341532639545885, + "grad_norm": 6.401993274688721, + "learning_rate": 1.99928821176503e-05, + "loss": 1.4958, + "step": 97 + }, + { + "epoch": 0.07417218543046358, + "grad_norm": 6.126581192016602, + "learning_rate": 1.9992576044705596e-05, + "loss": 1.4449, + "step": 98 + }, + { + "epoch": 0.0749290444654683, + "grad_norm": 8.766273498535156, + "learning_rate": 1.9992263531343348e-05, + "loss": 1.5218, + "step": 99 + }, + { + "epoch": 0.07568590350047304, + "grad_norm": 5.65410852432251, + "learning_rate": 1.9991944577764996e-05, + "loss": 1.5205, + "step": 100 + }, + { + "epoch": 0.07644276253547777, + "grad_norm": 5.447603702545166, + "learning_rate": 1.9991619184176136e-05, + "loss": 1.4651, + "step": 101 + }, + { + "epoch": 0.0771996215704825, + "grad_norm": 5.317190647125244, + "learning_rate": 1.9991287350786512e-05, + "loss": 1.5059, + "step": 102 + }, + { + "epoch": 0.07795648060548722, + "grad_norm": 5.233520984649658, + "learning_rate": 1.9990949077810015e-05, + "loss": 1.4556, + "step": 103 + }, + { + "epoch": 0.07871333964049196, + "grad_norm": 4.955499649047852, + "learning_rate": 1.9990604365464693e-05, + "loss": 1.4236, + "step": 104 + }, + { + "epoch": 0.07947019867549669, + "grad_norm": 9.175353050231934, + "learning_rate": 1.9990253213972742e-05, + "loss": 1.4482, + "step": 105 + }, + { + "epoch": 0.08022705771050143, + "grad_norm": 5.2216057777404785, + "learning_rate": 1.998989562356051e-05, + "loss": 1.4902, + "step": 106 + }, + { + "epoch": 0.08098391674550615, + "grad_norm": 4.395474910736084, + "learning_rate": 1.9989531594458487e-05, + "loss": 1.4419, + "step": 107 + }, + { + "epoch": 0.08174077578051088, + "grad_norm": 4.641335487365723, + "learning_rate": 1.998916112690133e-05, + "loss": 1.4715, + "step": 108 + }, + { + "epoch": 0.08249763481551561, + "grad_norm": 5.315745830535889, + "learning_rate": 1.9988784221127834e-05, + "loss": 1.4742, + "step": 109 + }, + { + "epoch": 0.08325449385052035, + "grad_norm": 5.404274940490723, + "learning_rate": 1.998840087738095e-05, + "loss": 1.4579, + "step": 110 + }, + { + "epoch": 0.08401135288552507, + "grad_norm": 4.288702011108398, + "learning_rate": 1.9988011095907768e-05, + "loss": 1.49, + "step": 111 + }, + { + "epoch": 0.0847682119205298, + "grad_norm": 4.434887409210205, + "learning_rate": 1.9987614876959536e-05, + "loss": 1.4946, + "step": 112 + }, + { + "epoch": 0.08552507095553454, + "grad_norm": 5.428564071655273, + "learning_rate": 1.9987212220791657e-05, + "loss": 1.3817, + "step": 113 + }, + { + "epoch": 0.08628192999053927, + "grad_norm": 3.9893720149993896, + "learning_rate": 1.9986803127663672e-05, + "loss": 1.4428, + "step": 114 + }, + { + "epoch": 0.08703878902554399, + "grad_norm": 4.35543966293335, + "learning_rate": 1.998638759783928e-05, + "loss": 1.3801, + "step": 115 + }, + { + "epoch": 0.08779564806054872, + "grad_norm": 4.2772722244262695, + "learning_rate": 1.9985965631586318e-05, + "loss": 1.3975, + "step": 116 + }, + { + "epoch": 0.08855250709555346, + "grad_norm": 4.769036769866943, + "learning_rate": 1.9985537229176787e-05, + "loss": 1.4413, + "step": 117 + }, + { + "epoch": 0.08930936613055819, + "grad_norm": 4.7659759521484375, + "learning_rate": 1.9985102390886825e-05, + "loss": 1.4665, + "step": 118 + }, + { + "epoch": 0.09006622516556291, + "grad_norm": 5.218923091888428, + "learning_rate": 1.9984661116996723e-05, + "loss": 1.4544, + "step": 119 + }, + { + "epoch": 0.09082308420056764, + "grad_norm": 4.296699047088623, + "learning_rate": 1.9984213407790924e-05, + "loss": 1.3944, + "step": 120 + }, + { + "epoch": 0.09157994323557238, + "grad_norm": 3.866936683654785, + "learning_rate": 1.9983759263558003e-05, + "loss": 1.4273, + "step": 121 + }, + { + "epoch": 0.09233680227057711, + "grad_norm": 4.711172103881836, + "learning_rate": 1.99832986845907e-05, + "loss": 1.4208, + "step": 122 + }, + { + "epoch": 0.09309366130558183, + "grad_norm": 4.553902626037598, + "learning_rate": 1.9982831671185905e-05, + "loss": 1.525, + "step": 123 + }, + { + "epoch": 0.09385052034058657, + "grad_norm": 4.0878801345825195, + "learning_rate": 1.9982358223644635e-05, + "loss": 1.4621, + "step": 124 + }, + { + "epoch": 0.0946073793755913, + "grad_norm": 4.239192485809326, + "learning_rate": 1.9981878342272074e-05, + "loss": 1.4151, + "step": 125 + }, + { + "epoch": 0.09536423841059603, + "grad_norm": 3.9742391109466553, + "learning_rate": 1.9981392027377548e-05, + "loss": 1.4588, + "step": 126 + }, + { + "epoch": 0.09612109744560075, + "grad_norm": 4.459286212921143, + "learning_rate": 1.9980899279274523e-05, + "loss": 1.414, + "step": 127 + }, + { + "epoch": 0.09687795648060549, + "grad_norm": 4.164027214050293, + "learning_rate": 1.9980400098280622e-05, + "loss": 1.4572, + "step": 128 + }, + { + "epoch": 0.09763481551561022, + "grad_norm": 4.778876304626465, + "learning_rate": 1.9979894484717604e-05, + "loss": 1.4279, + "step": 129 + }, + { + "epoch": 0.09839167455061495, + "grad_norm": 4.639044761657715, + "learning_rate": 1.9979382438911383e-05, + "loss": 1.4343, + "step": 130 + }, + { + "epoch": 0.09914853358561967, + "grad_norm": 4.090446949005127, + "learning_rate": 1.9978863961192018e-05, + "loss": 1.4802, + "step": 131 + }, + { + "epoch": 0.09990539262062441, + "grad_norm": 4.360771656036377, + "learning_rate": 1.9978339051893702e-05, + "loss": 1.4552, + "step": 132 + }, + { + "epoch": 0.10066225165562914, + "grad_norm": 3.851464033126831, + "learning_rate": 1.9977807711354796e-05, + "loss": 1.3779, + "step": 133 + }, + { + "epoch": 0.10141911069063388, + "grad_norm": 4.016122341156006, + "learning_rate": 1.997726993991779e-05, + "loss": 1.4313, + "step": 134 + }, + { + "epoch": 0.1021759697256386, + "grad_norm": 4.009467124938965, + "learning_rate": 1.997672573792932e-05, + "loss": 1.491, + "step": 135 + }, + { + "epoch": 0.10293282876064333, + "grad_norm": 3.8596322536468506, + "learning_rate": 1.997617510574018e-05, + "loss": 1.4724, + "step": 136 + }, + { + "epoch": 0.10368968779564806, + "grad_norm": 3.6011574268341064, + "learning_rate": 1.9975618043705282e-05, + "loss": 1.3931, + "step": 137 + }, + { + "epoch": 0.1044465468306528, + "grad_norm": 4.025736331939697, + "learning_rate": 1.997505455218371e-05, + "loss": 1.4269, + "step": 138 + }, + { + "epoch": 0.10520340586565752, + "grad_norm": 3.760977268218994, + "learning_rate": 1.9974484631538685e-05, + "loss": 1.4311, + "step": 139 + }, + { + "epoch": 0.10596026490066225, + "grad_norm": 4.554644584655762, + "learning_rate": 1.9973908282137565e-05, + "loss": 1.4535, + "step": 140 + }, + { + "epoch": 0.10671712393566699, + "grad_norm": 4.12142276763916, + "learning_rate": 1.9973325504351856e-05, + "loss": 1.4111, + "step": 141 + }, + { + "epoch": 0.10747398297067172, + "grad_norm": 3.9459025859832764, + "learning_rate": 1.9972736298557207e-05, + "loss": 1.4424, + "step": 142 + }, + { + "epoch": 0.10823084200567644, + "grad_norm": 3.65413236618042, + "learning_rate": 1.9972140665133412e-05, + "loss": 1.3589, + "step": 143 + }, + { + "epoch": 0.10898770104068117, + "grad_norm": 3.935250997543335, + "learning_rate": 1.997153860446441e-05, + "loss": 1.3985, + "step": 144 + }, + { + "epoch": 0.1097445600756859, + "grad_norm": 4.394814968109131, + "learning_rate": 1.9970930116938273e-05, + "loss": 1.4304, + "step": 145 + }, + { + "epoch": 0.11050141911069064, + "grad_norm": 3.6491141319274902, + "learning_rate": 1.997031520294723e-05, + "loss": 1.3928, + "step": 146 + }, + { + "epoch": 0.11125827814569536, + "grad_norm": 4.235386848449707, + "learning_rate": 1.9969693862887643e-05, + "loss": 1.4712, + "step": 147 + }, + { + "epoch": 0.1120151371807001, + "grad_norm": 4.189138412475586, + "learning_rate": 1.996906609716002e-05, + "loss": 1.3609, + "step": 148 + }, + { + "epoch": 0.11277199621570483, + "grad_norm": 3.729450225830078, + "learning_rate": 1.9968431906169005e-05, + "loss": 1.4229, + "step": 149 + }, + { + "epoch": 0.11352885525070956, + "grad_norm": 3.915863513946533, + "learning_rate": 1.996779129032339e-05, + "loss": 1.3628, + "step": 150 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 4.461569786071777, + "learning_rate": 1.9967144250036104e-05, + "loss": 1.4087, + "step": 151 + }, + { + "epoch": 0.11504257332071902, + "grad_norm": 4.412698745727539, + "learning_rate": 1.9966490785724223e-05, + "loss": 1.4392, + "step": 152 + }, + { + "epoch": 0.11579943235572375, + "grad_norm": 4.236743450164795, + "learning_rate": 1.9965830897808954e-05, + "loss": 1.4391, + "step": 153 + }, + { + "epoch": 0.11655629139072848, + "grad_norm": 4.672597408294678, + "learning_rate": 1.996516458671566e-05, + "loss": 1.3995, + "step": 154 + }, + { + "epoch": 0.1173131504257332, + "grad_norm": 5.059709072113037, + "learning_rate": 1.9964491852873833e-05, + "loss": 1.3566, + "step": 155 + }, + { + "epoch": 0.11807000946073794, + "grad_norm": 4.973750114440918, + "learning_rate": 1.99638126967171e-05, + "loss": 1.3993, + "step": 156 + }, + { + "epoch": 0.11882686849574267, + "grad_norm": 4.362597942352295, + "learning_rate": 1.996312711868324e-05, + "loss": 1.4254, + "step": 157 + }, + { + "epoch": 0.1195837275307474, + "grad_norm": 4.407685279846191, + "learning_rate": 1.9962435119214164e-05, + "loss": 1.3983, + "step": 158 + }, + { + "epoch": 0.12034058656575213, + "grad_norm": 4.614277362823486, + "learning_rate": 1.9961736698755928e-05, + "loss": 1.412, + "step": 159 + }, + { + "epoch": 0.12109744560075686, + "grad_norm": 4.18186092376709, + "learning_rate": 1.9961031857758718e-05, + "loss": 1.3653, + "step": 160 + }, + { + "epoch": 0.12185430463576159, + "grad_norm": 4.011139392852783, + "learning_rate": 1.9960320596676866e-05, + "loss": 1.4234, + "step": 161 + }, + { + "epoch": 0.12261116367076633, + "grad_norm": 4.428970813751221, + "learning_rate": 1.9959602915968842e-05, + "loss": 1.3899, + "step": 162 + }, + { + "epoch": 0.12336802270577105, + "grad_norm": 4.968282222747803, + "learning_rate": 1.995887881609725e-05, + "loss": 1.4235, + "step": 163 + }, + { + "epoch": 0.12412488174077578, + "grad_norm": 4.600246906280518, + "learning_rate": 1.9958148297528833e-05, + "loss": 1.3656, + "step": 164 + }, + { + "epoch": 0.12488174077578051, + "grad_norm": 4.392306804656982, + "learning_rate": 1.9957411360734476e-05, + "loss": 1.3804, + "step": 165 + }, + { + "epoch": 0.12563859981078523, + "grad_norm": 4.066370964050293, + "learning_rate": 1.995666800618919e-05, + "loss": 1.4013, + "step": 166 + }, + { + "epoch": 0.12639545884578998, + "grad_norm": 3.9358901977539062, + "learning_rate": 1.995591823437214e-05, + "loss": 1.3887, + "step": 167 + }, + { + "epoch": 0.1271523178807947, + "grad_norm": 4.182436466217041, + "learning_rate": 1.9955162045766607e-05, + "loss": 1.4011, + "step": 168 + }, + { + "epoch": 0.12790917691579942, + "grad_norm": 3.8715391159057617, + "learning_rate": 1.9954399440860026e-05, + "loss": 1.3881, + "step": 169 + }, + { + "epoch": 0.12866603595080417, + "grad_norm": 4.34489107131958, + "learning_rate": 1.9953630420143958e-05, + "loss": 1.3679, + "step": 170 + }, + { + "epoch": 0.1294228949858089, + "grad_norm": 4.175931930541992, + "learning_rate": 1.9952854984114097e-05, + "loss": 1.3995, + "step": 171 + }, + { + "epoch": 0.13017975402081364, + "grad_norm": 6.231164932250977, + "learning_rate": 1.9952073133270288e-05, + "loss": 1.3826, + "step": 172 + }, + { + "epoch": 0.13093661305581836, + "grad_norm": 3.969299554824829, + "learning_rate": 1.9951284868116495e-05, + "loss": 1.427, + "step": 173 + }, + { + "epoch": 0.13169347209082308, + "grad_norm": 5.056988716125488, + "learning_rate": 1.9950490189160818e-05, + "loss": 1.4377, + "step": 174 + }, + { + "epoch": 0.13245033112582782, + "grad_norm": 3.9641916751861572, + "learning_rate": 1.99496890969155e-05, + "loss": 1.3729, + "step": 175 + }, + { + "epoch": 0.13320719016083254, + "grad_norm": 4.119785308837891, + "learning_rate": 1.9948881591896913e-05, + "loss": 1.4061, + "step": 176 + }, + { + "epoch": 0.13396404919583726, + "grad_norm": 4.154798984527588, + "learning_rate": 1.9948067674625557e-05, + "loss": 1.4383, + "step": 177 + }, + { + "epoch": 0.134720908230842, + "grad_norm": 4.396413326263428, + "learning_rate": 1.994724734562607e-05, + "loss": 1.3806, + "step": 178 + }, + { + "epoch": 0.13547776726584673, + "grad_norm": 10.802559852600098, + "learning_rate": 1.9946420605427235e-05, + "loss": 1.4279, + "step": 179 + }, + { + "epoch": 0.13623462630085148, + "grad_norm": 4.602297782897949, + "learning_rate": 1.9945587454561944e-05, + "loss": 1.3618, + "step": 180 + }, + { + "epoch": 0.1369914853358562, + "grad_norm": 4.874974727630615, + "learning_rate": 1.994474789356724e-05, + "loss": 1.3582, + "step": 181 + }, + { + "epoch": 0.13774834437086092, + "grad_norm": 5.023828983306885, + "learning_rate": 1.994390192298429e-05, + "loss": 1.3445, + "step": 182 + }, + { + "epoch": 0.13850520340586567, + "grad_norm": 4.938666343688965, + "learning_rate": 1.994304954335839e-05, + "loss": 1.4221, + "step": 183 + }, + { + "epoch": 0.1392620624408704, + "grad_norm": 5.975377559661865, + "learning_rate": 1.9942190755238973e-05, + "loss": 1.3947, + "step": 184 + }, + { + "epoch": 0.1400189214758751, + "grad_norm": 8.078311920166016, + "learning_rate": 1.9941325559179608e-05, + "loss": 1.3925, + "step": 185 + }, + { + "epoch": 0.14077578051087986, + "grad_norm": 5.0124897956848145, + "learning_rate": 1.9940453955737976e-05, + "loss": 1.3958, + "step": 186 + }, + { + "epoch": 0.14153263954588458, + "grad_norm": 4.94537353515625, + "learning_rate": 1.9939575945475905e-05, + "loss": 1.3855, + "step": 187 + }, + { + "epoch": 0.14228949858088932, + "grad_norm": 5.828818321228027, + "learning_rate": 1.9938691528959348e-05, + "loss": 1.4567, + "step": 188 + }, + { + "epoch": 0.14304635761589404, + "grad_norm": 4.672356605529785, + "learning_rate": 1.993780070675838e-05, + "loss": 1.3581, + "step": 189 + }, + { + "epoch": 0.14380321665089876, + "grad_norm": 5.052429676055908, + "learning_rate": 1.993690347944722e-05, + "loss": 1.3874, + "step": 190 + }, + { + "epoch": 0.1445600756859035, + "grad_norm": 4.454349040985107, + "learning_rate": 1.9935999847604204e-05, + "loss": 1.4282, + "step": 191 + }, + { + "epoch": 0.14531693472090823, + "grad_norm": 4.81812858581543, + "learning_rate": 1.9935089811811794e-05, + "loss": 1.4103, + "step": 192 + }, + { + "epoch": 0.14607379375591295, + "grad_norm": 3.8706412315368652, + "learning_rate": 1.993417337265659e-05, + "loss": 1.4024, + "step": 193 + }, + { + "epoch": 0.1468306527909177, + "grad_norm": 3.948594093322754, + "learning_rate": 1.9933250530729314e-05, + "loss": 1.387, + "step": 194 + }, + { + "epoch": 0.14758751182592242, + "grad_norm": 4.73719596862793, + "learning_rate": 1.993232128662482e-05, + "loss": 1.4528, + "step": 195 + }, + { + "epoch": 0.14834437086092717, + "grad_norm": 3.9017584323883057, + "learning_rate": 1.993138564094208e-05, + "loss": 1.4245, + "step": 196 + }, + { + "epoch": 0.14910122989593189, + "grad_norm": 6.6446309089660645, + "learning_rate": 1.9930443594284193e-05, + "loss": 1.4046, + "step": 197 + }, + { + "epoch": 0.1498580889309366, + "grad_norm": 4.191623210906982, + "learning_rate": 1.9929495147258395e-05, + "loss": 1.3987, + "step": 198 + }, + { + "epoch": 0.15061494796594135, + "grad_norm": 3.8362607955932617, + "learning_rate": 1.992854030047604e-05, + "loss": 1.3583, + "step": 199 + }, + { + "epoch": 0.15137180700094607, + "grad_norm": 4.051894187927246, + "learning_rate": 1.9927579054552603e-05, + "loss": 1.3856, + "step": 200 + }, + { + "epoch": 0.1521286660359508, + "grad_norm": 3.792412281036377, + "learning_rate": 1.992661141010769e-05, + "loss": 1.3961, + "step": 201 + }, + { + "epoch": 0.15288552507095554, + "grad_norm": 3.697641134262085, + "learning_rate": 1.992563736776503e-05, + "loss": 1.3808, + "step": 202 + }, + { + "epoch": 0.15364238410596026, + "grad_norm": 4.134721279144287, + "learning_rate": 1.992465692815248e-05, + "loss": 1.3594, + "step": 203 + }, + { + "epoch": 0.154399243140965, + "grad_norm": 4.171304225921631, + "learning_rate": 1.9923670091902013e-05, + "loss": 1.4217, + "step": 204 + }, + { + "epoch": 0.15515610217596973, + "grad_norm": 3.476039171218872, + "learning_rate": 1.992267685964973e-05, + "loss": 1.3967, + "step": 205 + }, + { + "epoch": 0.15591296121097445, + "grad_norm": 3.4347240924835205, + "learning_rate": 1.9921677232035846e-05, + "loss": 1.3422, + "step": 206 + }, + { + "epoch": 0.1566698202459792, + "grad_norm": 3.7200000286102295, + "learning_rate": 1.992067120970472e-05, + "loss": 1.3538, + "step": 207 + }, + { + "epoch": 0.15742667928098392, + "grad_norm": 3.8184263706207275, + "learning_rate": 1.9919658793304804e-05, + "loss": 1.3956, + "step": 208 + }, + { + "epoch": 0.15818353831598864, + "grad_norm": 3.761478900909424, + "learning_rate": 1.9918639983488694e-05, + "loss": 1.4233, + "step": 209 + }, + { + "epoch": 0.15894039735099338, + "grad_norm": 3.587502956390381, + "learning_rate": 1.99176147809131e-05, + "loss": 1.3514, + "step": 210 + }, + { + "epoch": 0.1596972563859981, + "grad_norm": 3.3828699588775635, + "learning_rate": 1.9916583186238847e-05, + "loss": 1.3766, + "step": 211 + }, + { + "epoch": 0.16045411542100285, + "grad_norm": 3.2444939613342285, + "learning_rate": 1.9915545200130893e-05, + "loss": 1.4051, + "step": 212 + }, + { + "epoch": 0.16121097445600757, + "grad_norm": 3.4360880851745605, + "learning_rate": 1.9914500823258298e-05, + "loss": 1.3364, + "step": 213 + }, + { + "epoch": 0.1619678334910123, + "grad_norm": 3.3002805709838867, + "learning_rate": 1.9913450056294255e-05, + "loss": 1.3807, + "step": 214 + }, + { + "epoch": 0.16272469252601704, + "grad_norm": 3.551203489303589, + "learning_rate": 1.991239289991608e-05, + "loss": 1.4077, + "step": 215 + }, + { + "epoch": 0.16348155156102176, + "grad_norm": 2.9857335090637207, + "learning_rate": 1.991132935480519e-05, + "loss": 1.3667, + "step": 216 + }, + { + "epoch": 0.16423841059602648, + "grad_norm": 3.935084342956543, + "learning_rate": 1.9910259421647136e-05, + "loss": 1.3973, + "step": 217 + }, + { + "epoch": 0.16499526963103123, + "grad_norm": 3.209479570388794, + "learning_rate": 1.9909183101131576e-05, + "loss": 1.3752, + "step": 218 + }, + { + "epoch": 0.16575212866603595, + "grad_norm": 3.311500072479248, + "learning_rate": 1.9908100393952293e-05, + "loss": 1.3566, + "step": 219 + }, + { + "epoch": 0.1665089877010407, + "grad_norm": 3.0751259326934814, + "learning_rate": 1.990701130080718e-05, + "loss": 1.411, + "step": 220 + }, + { + "epoch": 0.16726584673604541, + "grad_norm": 3.3133180141448975, + "learning_rate": 1.9905915822398257e-05, + "loss": 1.4006, + "step": 221 + }, + { + "epoch": 0.16802270577105013, + "grad_norm": 3.2017252445220947, + "learning_rate": 1.9904813959431646e-05, + "loss": 1.4028, + "step": 222 + }, + { + "epoch": 0.16877956480605488, + "grad_norm": 3.404691219329834, + "learning_rate": 1.9903705712617595e-05, + "loss": 1.355, + "step": 223 + }, + { + "epoch": 0.1695364238410596, + "grad_norm": 3.1049623489379883, + "learning_rate": 1.990259108267046e-05, + "loss": 1.3305, + "step": 224 + }, + { + "epoch": 0.17029328287606432, + "grad_norm": 3.3933444023132324, + "learning_rate": 1.990147007030871e-05, + "loss": 1.3718, + "step": 225 + }, + { + "epoch": 0.17105014191106907, + "grad_norm": 3.479591131210327, + "learning_rate": 1.9900342676254945e-05, + "loss": 1.393, + "step": 226 + }, + { + "epoch": 0.1718070009460738, + "grad_norm": 3.3810219764709473, + "learning_rate": 1.989920890123586e-05, + "loss": 1.3864, + "step": 227 + }, + { + "epoch": 0.17256385998107854, + "grad_norm": 3.4179928302764893, + "learning_rate": 1.9898068745982263e-05, + "loss": 1.3322, + "step": 228 + }, + { + "epoch": 0.17332071901608326, + "grad_norm": 3.288922071456909, + "learning_rate": 1.9896922211229088e-05, + "loss": 1.3738, + "step": 229 + }, + { + "epoch": 0.17407757805108798, + "grad_norm": 3.4045164585113525, + "learning_rate": 1.9895769297715373e-05, + "loss": 1.3509, + "step": 230 + }, + { + "epoch": 0.17483443708609273, + "grad_norm": 3.384779453277588, + "learning_rate": 1.9894610006184264e-05, + "loss": 1.3596, + "step": 231 + }, + { + "epoch": 0.17559129612109745, + "grad_norm": 3.6631815433502197, + "learning_rate": 1.989344433738303e-05, + "loss": 1.4126, + "step": 232 + }, + { + "epoch": 0.17634815515610217, + "grad_norm": 3.1958444118499756, + "learning_rate": 1.9892272292063034e-05, + "loss": 1.3711, + "step": 233 + }, + { + "epoch": 0.1771050141911069, + "grad_norm": 3.4087891578674316, + "learning_rate": 1.989109387097977e-05, + "loss": 1.3604, + "step": 234 + }, + { + "epoch": 0.17786187322611163, + "grad_norm": 3.5950968265533447, + "learning_rate": 1.988990907489282e-05, + "loss": 1.3464, + "step": 235 + }, + { + "epoch": 0.17861873226111638, + "grad_norm": 3.5223278999328613, + "learning_rate": 1.988871790456589e-05, + "loss": 1.3965, + "step": 236 + }, + { + "epoch": 0.1793755912961211, + "grad_norm": 3.259669780731201, + "learning_rate": 1.988752036076679e-05, + "loss": 1.3915, + "step": 237 + }, + { + "epoch": 0.18013245033112582, + "grad_norm": 3.1698622703552246, + "learning_rate": 1.9886316444267436e-05, + "loss": 1.3674, + "step": 238 + }, + { + "epoch": 0.18088930936613057, + "grad_norm": 3.336416482925415, + "learning_rate": 1.9885106155843857e-05, + "loss": 1.3476, + "step": 239 + }, + { + "epoch": 0.1816461684011353, + "grad_norm": 3.2986626625061035, + "learning_rate": 1.9883889496276188e-05, + "loss": 1.3139, + "step": 240 + }, + { + "epoch": 0.18240302743614, + "grad_norm": 3.2197721004486084, + "learning_rate": 1.9882666466348665e-05, + "loss": 1.3611, + "step": 241 + }, + { + "epoch": 0.18315988647114476, + "grad_norm": 3.120088815689087, + "learning_rate": 1.988143706684964e-05, + "loss": 1.364, + "step": 242 + }, + { + "epoch": 0.18391674550614948, + "grad_norm": 2.9464049339294434, + "learning_rate": 1.9880201298571558e-05, + "loss": 1.3295, + "step": 243 + }, + { + "epoch": 0.18467360454115422, + "grad_norm": 3.3369717597961426, + "learning_rate": 1.9878959162310983e-05, + "loss": 1.3669, + "step": 244 + }, + { + "epoch": 0.18543046357615894, + "grad_norm": 2.900787353515625, + "learning_rate": 1.987771065886857e-05, + "loss": 1.3565, + "step": 245 + }, + { + "epoch": 0.18618732261116366, + "grad_norm": 3.0211544036865234, + "learning_rate": 1.9876455789049096e-05, + "loss": 1.3882, + "step": 246 + }, + { + "epoch": 0.1869441816461684, + "grad_norm": 3.1576292514801025, + "learning_rate": 1.9875194553661415e-05, + "loss": 1.3075, + "step": 247 + }, + { + "epoch": 0.18770104068117313, + "grad_norm": 2.8850550651550293, + "learning_rate": 1.9873926953518515e-05, + "loss": 1.3665, + "step": 248 + }, + { + "epoch": 0.18845789971617785, + "grad_norm": 3.188582420349121, + "learning_rate": 1.9872652989437467e-05, + "loss": 1.3555, + "step": 249 + }, + { + "epoch": 0.1892147587511826, + "grad_norm": 4.313934803009033, + "learning_rate": 1.9871372662239446e-05, + "loss": 1.2937, + "step": 250 + }, + { + "epoch": 0.18997161778618732, + "grad_norm": 3.0744991302490234, + "learning_rate": 1.9870085972749733e-05, + "loss": 1.3289, + "step": 251 + }, + { + "epoch": 0.19072847682119207, + "grad_norm": 2.9217262268066406, + "learning_rate": 1.986879292179771e-05, + "loss": 1.33, + "step": 252 + }, + { + "epoch": 0.1914853358561968, + "grad_norm": 3.089919328689575, + "learning_rate": 1.986749351021686e-05, + "loss": 1.3532, + "step": 253 + }, + { + "epoch": 0.1922421948912015, + "grad_norm": 3.5609021186828613, + "learning_rate": 1.9866187738844753e-05, + "loss": 1.4002, + "step": 254 + }, + { + "epoch": 0.19299905392620625, + "grad_norm": 3.0856025218963623, + "learning_rate": 1.986487560852308e-05, + "loss": 1.3691, + "step": 255 + }, + { + "epoch": 0.19375591296121097, + "grad_norm": 2.679279327392578, + "learning_rate": 1.986355712009762e-05, + "loss": 1.3412, + "step": 256 + }, + { + "epoch": 0.1945127719962157, + "grad_norm": 3.1083905696868896, + "learning_rate": 1.9862232274418246e-05, + "loss": 1.3023, + "step": 257 + }, + { + "epoch": 0.19526963103122044, + "grad_norm": 2.726358413696289, + "learning_rate": 1.9860901072338936e-05, + "loss": 1.377, + "step": 258 + }, + { + "epoch": 0.19602649006622516, + "grad_norm": 2.966639995574951, + "learning_rate": 1.985956351471776e-05, + "loss": 1.304, + "step": 259 + }, + { + "epoch": 0.1967833491012299, + "grad_norm": 2.8776400089263916, + "learning_rate": 1.9858219602416887e-05, + "loss": 1.3481, + "step": 260 + }, + { + "epoch": 0.19754020813623463, + "grad_norm": 3.0099427700042725, + "learning_rate": 1.9856869336302588e-05, + "loss": 1.4332, + "step": 261 + }, + { + "epoch": 0.19829706717123935, + "grad_norm": 3.146959066390991, + "learning_rate": 1.985551271724522e-05, + "loss": 1.3372, + "step": 262 + }, + { + "epoch": 0.1990539262062441, + "grad_norm": 3.076327323913574, + "learning_rate": 1.9854149746119232e-05, + "loss": 1.3258, + "step": 263 + }, + { + "epoch": 0.19981078524124882, + "grad_norm": 2.8409347534179688, + "learning_rate": 1.9852780423803187e-05, + "loss": 1.2975, + "step": 264 + }, + { + "epoch": 0.20056764427625354, + "grad_norm": 3.1386849880218506, + "learning_rate": 1.9851404751179723e-05, + "loss": 1.3395, + "step": 265 + }, + { + "epoch": 0.20132450331125828, + "grad_norm": 3.104682445526123, + "learning_rate": 1.9850022729135578e-05, + "loss": 1.3667, + "step": 266 + }, + { + "epoch": 0.202081362346263, + "grad_norm": 3.337529182434082, + "learning_rate": 1.9848634358561584e-05, + "loss": 1.3145, + "step": 267 + }, + { + "epoch": 0.20283822138126775, + "grad_norm": 3.380446195602417, + "learning_rate": 1.984723964035266e-05, + "loss": 1.364, + "step": 268 + }, + { + "epoch": 0.20359508041627247, + "grad_norm": 3.161867141723633, + "learning_rate": 1.9845838575407824e-05, + "loss": 1.3333, + "step": 269 + }, + { + "epoch": 0.2043519394512772, + "grad_norm": 3.323434352874756, + "learning_rate": 1.9844431164630178e-05, + "loss": 1.3897, + "step": 270 + }, + { + "epoch": 0.20510879848628194, + "grad_norm": 3.4208099842071533, + "learning_rate": 1.984301740892692e-05, + "loss": 1.333, + "step": 271 + }, + { + "epoch": 0.20586565752128666, + "grad_norm": 3.178248643875122, + "learning_rate": 1.984159730920933e-05, + "loss": 1.3033, + "step": 272 + }, + { + "epoch": 0.20662251655629138, + "grad_norm": 3.0145297050476074, + "learning_rate": 1.9840170866392795e-05, + "loss": 1.3055, + "step": 273 + }, + { + "epoch": 0.20737937559129613, + "grad_norm": 3.6076059341430664, + "learning_rate": 1.9838738081396764e-05, + "loss": 1.3442, + "step": 274 + }, + { + "epoch": 0.20813623462630085, + "grad_norm": 3.3622937202453613, + "learning_rate": 1.9837298955144796e-05, + "loss": 1.3666, + "step": 275 + }, + { + "epoch": 0.2088930936613056, + "grad_norm": 3.782317876815796, + "learning_rate": 1.9835853488564527e-05, + "loss": 1.3791, + "step": 276 + }, + { + "epoch": 0.20964995269631032, + "grad_norm": 3.1874301433563232, + "learning_rate": 1.9834401682587688e-05, + "loss": 1.3703, + "step": 277 + }, + { + "epoch": 0.21040681173131504, + "grad_norm": 3.0065550804138184, + "learning_rate": 1.9832943538150083e-05, + "loss": 1.331, + "step": 278 + }, + { + "epoch": 0.21116367076631978, + "grad_norm": 3.953733444213867, + "learning_rate": 1.9831479056191618e-05, + "loss": 1.3855, + "step": 279 + }, + { + "epoch": 0.2119205298013245, + "grad_norm": 3.682438611984253, + "learning_rate": 1.983000823765627e-05, + "loss": 1.3605, + "step": 280 + }, + { + "epoch": 0.21267738883632922, + "grad_norm": 3.57037615776062, + "learning_rate": 1.9828531083492102e-05, + "loss": 1.3048, + "step": 281 + }, + { + "epoch": 0.21343424787133397, + "grad_norm": 3.4117233753204346, + "learning_rate": 1.9827047594651275e-05, + "loss": 1.3606, + "step": 282 + }, + { + "epoch": 0.2141911069063387, + "grad_norm": 2.914785623550415, + "learning_rate": 1.982555777209002e-05, + "loss": 1.3596, + "step": 283 + }, + { + "epoch": 0.21494796594134344, + "grad_norm": 3.271235942840576, + "learning_rate": 1.9824061616768652e-05, + "loss": 1.3208, + "step": 284 + }, + { + "epoch": 0.21570482497634816, + "grad_norm": 3.3142642974853516, + "learning_rate": 1.982255912965157e-05, + "loss": 1.3574, + "step": 285 + }, + { + "epoch": 0.21646168401135288, + "grad_norm": 3.752458095550537, + "learning_rate": 1.9821050311707253e-05, + "loss": 1.3818, + "step": 286 + }, + { + "epoch": 0.21721854304635763, + "grad_norm": 3.1010730266571045, + "learning_rate": 1.9819535163908266e-05, + "loss": 1.2799, + "step": 287 + }, + { + "epoch": 0.21797540208136235, + "grad_norm": 3.3089754581451416, + "learning_rate": 1.9818013687231252e-05, + "loss": 1.3719, + "step": 288 + }, + { + "epoch": 0.21873226111636707, + "grad_norm": 3.800584316253662, + "learning_rate": 1.9816485882656925e-05, + "loss": 1.3458, + "step": 289 + }, + { + "epoch": 0.2194891201513718, + "grad_norm": 3.5390021800994873, + "learning_rate": 1.9814951751170087e-05, + "loss": 1.3558, + "step": 290 + }, + { + "epoch": 0.22024597918637653, + "grad_norm": 3.37929630279541, + "learning_rate": 1.9813411293759618e-05, + "loss": 1.3236, + "step": 291 + }, + { + "epoch": 0.22100283822138128, + "grad_norm": 3.255699872970581, + "learning_rate": 1.9811864511418467e-05, + "loss": 1.3245, + "step": 292 + }, + { + "epoch": 0.221759697256386, + "grad_norm": 3.8194658756256104, + "learning_rate": 1.981031140514367e-05, + "loss": 1.3381, + "step": 293 + }, + { + "epoch": 0.22251655629139072, + "grad_norm": 3.8124804496765137, + "learning_rate": 1.9808751975936344e-05, + "loss": 1.3006, + "step": 294 + }, + { + "epoch": 0.22327341532639547, + "grad_norm": 3.637120246887207, + "learning_rate": 1.980718622480166e-05, + "loss": 1.3411, + "step": 295 + }, + { + "epoch": 0.2240302743614002, + "grad_norm": 3.8235883712768555, + "learning_rate": 1.9805614152748887e-05, + "loss": 1.3285, + "step": 296 + }, + { + "epoch": 0.2247871333964049, + "grad_norm": 3.892608642578125, + "learning_rate": 1.980403576079135e-05, + "loss": 1.3015, + "step": 297 + }, + { + "epoch": 0.22554399243140966, + "grad_norm": 3.9942359924316406, + "learning_rate": 1.9802451049946468e-05, + "loss": 1.3404, + "step": 298 + }, + { + "epoch": 0.22630085146641438, + "grad_norm": 3.8982861042022705, + "learning_rate": 1.9800860021235708e-05, + "loss": 1.3194, + "step": 299 + }, + { + "epoch": 0.22705771050141912, + "grad_norm": 4.402480125427246, + "learning_rate": 1.979926267568463e-05, + "loss": 1.3383, + "step": 300 + }, + { + "epoch": 0.22781456953642384, + "grad_norm": 3.9718708992004395, + "learning_rate": 1.979765901432286e-05, + "loss": 1.301, + "step": 301 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 3.8678481578826904, + "learning_rate": 1.979604903818409e-05, + "loss": 1.3269, + "step": 302 + }, + { + "epoch": 0.2293282876064333, + "grad_norm": 3.352957010269165, + "learning_rate": 1.979443274830609e-05, + "loss": 1.3332, + "step": 303 + }, + { + "epoch": 0.23008514664143803, + "grad_norm": 3.937535524368286, + "learning_rate": 1.9792810145730696e-05, + "loss": 1.3464, + "step": 304 + }, + { + "epoch": 0.23084200567644275, + "grad_norm": 4.081162929534912, + "learning_rate": 1.9791181231503804e-05, + "loss": 1.327, + "step": 305 + }, + { + "epoch": 0.2315988647114475, + "grad_norm": 3.5600180625915527, + "learning_rate": 1.97895460066754e-05, + "loss": 1.3536, + "step": 306 + }, + { + "epoch": 0.23235572374645222, + "grad_norm": 3.9321706295013428, + "learning_rate": 1.9787904472299512e-05, + "loss": 1.2999, + "step": 307 + }, + { + "epoch": 0.23311258278145697, + "grad_norm": 4.384609699249268, + "learning_rate": 1.978625662943426e-05, + "loss": 1.3461, + "step": 308 + }, + { + "epoch": 0.2338694418164617, + "grad_norm": 4.421790599822998, + "learning_rate": 1.978460247914181e-05, + "loss": 1.3324, + "step": 309 + }, + { + "epoch": 0.2346263008514664, + "grad_norm": 4.101651191711426, + "learning_rate": 1.9782942022488404e-05, + "loss": 1.2738, + "step": 310 + }, + { + "epoch": 0.23538315988647115, + "grad_norm": 4.219285488128662, + "learning_rate": 1.978127526054435e-05, + "loss": 1.3519, + "step": 311 + }, + { + "epoch": 0.23614001892147587, + "grad_norm": 3.5981838703155518, + "learning_rate": 1.9779602194384014e-05, + "loss": 1.3546, + "step": 312 + }, + { + "epoch": 0.2368968779564806, + "grad_norm": 3.758359432220459, + "learning_rate": 1.9777922825085835e-05, + "loss": 1.3264, + "step": 313 + }, + { + "epoch": 0.23765373699148534, + "grad_norm": 3.7645103931427, + "learning_rate": 1.97762371537323e-05, + "loss": 1.3135, + "step": 314 + }, + { + "epoch": 0.23841059602649006, + "grad_norm": 3.3905699253082275, + "learning_rate": 1.9774545181409973e-05, + "loss": 1.2848, + "step": 315 + }, + { + "epoch": 0.2391674550614948, + "grad_norm": 3.6732635498046875, + "learning_rate": 1.9772846909209473e-05, + "loss": 1.3487, + "step": 316 + }, + { + "epoch": 0.23992431409649953, + "grad_norm": 3.8122737407684326, + "learning_rate": 1.9771142338225476e-05, + "loss": 1.333, + "step": 317 + }, + { + "epoch": 0.24068117313150425, + "grad_norm": 4.025964260101318, + "learning_rate": 1.9769431469556728e-05, + "loss": 1.3431, + "step": 318 + }, + { + "epoch": 0.241438032166509, + "grad_norm": 3.054323196411133, + "learning_rate": 1.9767714304306024e-05, + "loss": 1.3279, + "step": 319 + }, + { + "epoch": 0.24219489120151372, + "grad_norm": 4.698709964752197, + "learning_rate": 1.9765990843580227e-05, + "loss": 1.3209, + "step": 320 + }, + { + "epoch": 0.24295175023651844, + "grad_norm": 3.341327428817749, + "learning_rate": 1.976426108849025e-05, + "loss": 1.3424, + "step": 321 + }, + { + "epoch": 0.24370860927152319, + "grad_norm": 3.9361190795898438, + "learning_rate": 1.9762525040151074e-05, + "loss": 1.3083, + "step": 322 + }, + { + "epoch": 0.2444654683065279, + "grad_norm": 3.340085506439209, + "learning_rate": 1.9760782699681716e-05, + "loss": 1.3358, + "step": 323 + }, + { + "epoch": 0.24522232734153265, + "grad_norm": 3.044618606567383, + "learning_rate": 1.9759034068205273e-05, + "loss": 1.3099, + "step": 324 + }, + { + "epoch": 0.24597918637653737, + "grad_norm": 3.619760274887085, + "learning_rate": 1.9757279146848883e-05, + "loss": 1.3455, + "step": 325 + }, + { + "epoch": 0.2467360454115421, + "grad_norm": 3.7121100425720215, + "learning_rate": 1.975551793674374e-05, + "loss": 1.3106, + "step": 326 + }, + { + "epoch": 0.24749290444654684, + "grad_norm": 3.5931692123413086, + "learning_rate": 1.9753750439025095e-05, + "loss": 1.2905, + "step": 327 + }, + { + "epoch": 0.24824976348155156, + "grad_norm": 3.603030204772949, + "learning_rate": 1.975197665483225e-05, + "loss": 1.3319, + "step": 328 + }, + { + "epoch": 0.24900662251655628, + "grad_norm": 3.6277918815612793, + "learning_rate": 1.9750196585308564e-05, + "loss": 1.3393, + "step": 329 + }, + { + "epoch": 0.24976348155156103, + "grad_norm": 3.5887362957000732, + "learning_rate": 1.974841023160143e-05, + "loss": 1.3866, + "step": 330 + }, + { + "epoch": 0.25052034058656575, + "grad_norm": 3.4283299446105957, + "learning_rate": 1.974661759486232e-05, + "loss": 1.329, + "step": 331 + }, + { + "epoch": 0.25127719962157047, + "grad_norm": 3.7355992794036865, + "learning_rate": 1.9744818676246724e-05, + "loss": 1.3129, + "step": 332 + }, + { + "epoch": 0.2520340586565752, + "grad_norm": 3.726663589477539, + "learning_rate": 1.974301347691421e-05, + "loss": 1.3665, + "step": 333 + }, + { + "epoch": 0.25279091769157996, + "grad_norm": 3.93129825592041, + "learning_rate": 1.9741201998028377e-05, + "loss": 1.3876, + "step": 334 + }, + { + "epoch": 0.2535477767265847, + "grad_norm": 3.588931083679199, + "learning_rate": 1.9739384240756873e-05, + "loss": 1.3715, + "step": 335 + }, + { + "epoch": 0.2543046357615894, + "grad_norm": 3.4406232833862305, + "learning_rate": 1.9737560206271404e-05, + "loss": 1.3013, + "step": 336 + }, + { + "epoch": 0.2550614947965941, + "grad_norm": 3.481201171875, + "learning_rate": 1.9735729895747714e-05, + "loss": 1.3625, + "step": 337 + }, + { + "epoch": 0.25581835383159884, + "grad_norm": 3.7452211380004883, + "learning_rate": 1.973389331036559e-05, + "loss": 1.3452, + "step": 338 + }, + { + "epoch": 0.2565752128666036, + "grad_norm": 3.8469581604003906, + "learning_rate": 1.973205045130887e-05, + "loss": 1.3824, + "step": 339 + }, + { + "epoch": 0.25733207190160834, + "grad_norm": 3.252890110015869, + "learning_rate": 1.9730201319765423e-05, + "loss": 1.311, + "step": 340 + }, + { + "epoch": 0.25808893093661306, + "grad_norm": 3.9583048820495605, + "learning_rate": 1.9728345916927187e-05, + "loss": 1.3244, + "step": 341 + }, + { + "epoch": 0.2588457899716178, + "grad_norm": 3.6613519191741943, + "learning_rate": 1.9726484243990115e-05, + "loss": 1.3539, + "step": 342 + }, + { + "epoch": 0.2596026490066225, + "grad_norm": 3.4180917739868164, + "learning_rate": 1.9724616302154218e-05, + "loss": 1.3353, + "step": 343 + }, + { + "epoch": 0.2603595080416273, + "grad_norm": 3.7470951080322266, + "learning_rate": 1.9722742092623536e-05, + "loss": 1.2864, + "step": 344 + }, + { + "epoch": 0.261116367076632, + "grad_norm": 4.141618251800537, + "learning_rate": 1.9720861616606165e-05, + "loss": 1.3486, + "step": 345 + }, + { + "epoch": 0.2618732261116367, + "grad_norm": 3.7161524295806885, + "learning_rate": 1.9718974875314226e-05, + "loss": 1.339, + "step": 346 + }, + { + "epoch": 0.26263008514664143, + "grad_norm": 4.011509895324707, + "learning_rate": 1.9717081869963887e-05, + "loss": 1.4027, + "step": 347 + }, + { + "epoch": 0.26338694418164615, + "grad_norm": 4.976902008056641, + "learning_rate": 1.9715182601775348e-05, + "loss": 1.3078, + "step": 348 + }, + { + "epoch": 0.2641438032166509, + "grad_norm": 3.8435733318328857, + "learning_rate": 1.9713277071972844e-05, + "loss": 1.3013, + "step": 349 + }, + { + "epoch": 0.26490066225165565, + "grad_norm": 3.3969762325286865, + "learning_rate": 1.971136528178466e-05, + "loss": 1.3078, + "step": 350 + }, + { + "epoch": 0.26565752128666037, + "grad_norm": 4.123608112335205, + "learning_rate": 1.9709447232443096e-05, + "loss": 1.3476, + "step": 351 + }, + { + "epoch": 0.2664143803216651, + "grad_norm": 3.974820137023926, + "learning_rate": 1.9707522925184507e-05, + "loss": 1.377, + "step": 352 + }, + { + "epoch": 0.2671712393566698, + "grad_norm": 4.08565616607666, + "learning_rate": 1.9705592361249267e-05, + "loss": 1.3559, + "step": 353 + }, + { + "epoch": 0.26792809839167453, + "grad_norm": 3.7338943481445312, + "learning_rate": 1.970365554188179e-05, + "loss": 1.2845, + "step": 354 + }, + { + "epoch": 0.2686849574266793, + "grad_norm": 3.806567430496216, + "learning_rate": 1.9701712468330518e-05, + "loss": 1.4283, + "step": 355 + }, + { + "epoch": 0.269441816461684, + "grad_norm": 3.4662294387817383, + "learning_rate": 1.9699763141847928e-05, + "loss": 1.3068, + "step": 356 + }, + { + "epoch": 0.27019867549668874, + "grad_norm": 3.5118749141693115, + "learning_rate": 1.9697807563690522e-05, + "loss": 1.266, + "step": 357 + }, + { + "epoch": 0.27095553453169346, + "grad_norm": 4.166219711303711, + "learning_rate": 1.969584573511885e-05, + "loss": 1.3355, + "step": 358 + }, + { + "epoch": 0.2717123935666982, + "grad_norm": 3.828523635864258, + "learning_rate": 1.969387765739746e-05, + "loss": 1.2712, + "step": 359 + }, + { + "epoch": 0.27246925260170296, + "grad_norm": 3.8785219192504883, + "learning_rate": 1.969190333179495e-05, + "loss": 1.2761, + "step": 360 + }, + { + "epoch": 0.2732261116367077, + "grad_norm": 3.772268056869507, + "learning_rate": 1.9689922759583947e-05, + "loss": 1.372, + "step": 361 + }, + { + "epoch": 0.2739829706717124, + "grad_norm": 3.7379493713378906, + "learning_rate": 1.968793594204109e-05, + "loss": 1.2843, + "step": 362 + }, + { + "epoch": 0.2747398297067171, + "grad_norm": 4.294455051422119, + "learning_rate": 1.9685942880447054e-05, + "loss": 1.3069, + "step": 363 + }, + { + "epoch": 0.27549668874172184, + "grad_norm": 4.1428728103637695, + "learning_rate": 1.9683943576086536e-05, + "loss": 1.366, + "step": 364 + }, + { + "epoch": 0.27625354777672656, + "grad_norm": 3.9030814170837402, + "learning_rate": 1.9681938030248257e-05, + "loss": 1.342, + "step": 365 + }, + { + "epoch": 0.27701040681173134, + "grad_norm": 4.4898681640625, + "learning_rate": 1.967992624422496e-05, + "loss": 1.2735, + "step": 366 + }, + { + "epoch": 0.27776726584673606, + "grad_norm": 4.548799514770508, + "learning_rate": 1.9677908219313414e-05, + "loss": 1.3589, + "step": 367 + }, + { + "epoch": 0.2785241248817408, + "grad_norm": 4.4808478355407715, + "learning_rate": 1.9675883956814403e-05, + "loss": 1.373, + "step": 368 + }, + { + "epoch": 0.2792809839167455, + "grad_norm": 4.146103858947754, + "learning_rate": 1.967385345803274e-05, + "loss": 1.2748, + "step": 369 + }, + { + "epoch": 0.2800378429517502, + "grad_norm": 5.006552696228027, + "learning_rate": 1.9671816724277254e-05, + "loss": 1.2852, + "step": 370 + }, + { + "epoch": 0.280794701986755, + "grad_norm": 4.279321670532227, + "learning_rate": 1.966977375686079e-05, + "loss": 1.3634, + "step": 371 + }, + { + "epoch": 0.2815515610217597, + "grad_norm": 5.318479537963867, + "learning_rate": 1.9667724557100214e-05, + "loss": 1.3184, + "step": 372 + }, + { + "epoch": 0.28230842005676443, + "grad_norm": 4.354931354522705, + "learning_rate": 1.966566912631641e-05, + "loss": 1.3018, + "step": 373 + }, + { + "epoch": 0.28306527909176915, + "grad_norm": 3.5126800537109375, + "learning_rate": 1.9663607465834275e-05, + "loss": 1.2811, + "step": 374 + }, + { + "epoch": 0.28382213812677387, + "grad_norm": 4.875300407409668, + "learning_rate": 1.9661539576982728e-05, + "loss": 1.3238, + "step": 375 + }, + { + "epoch": 0.28457899716177865, + "grad_norm": 4.699173450469971, + "learning_rate": 1.9659465461094692e-05, + "loss": 1.3223, + "step": 376 + }, + { + "epoch": 0.28533585619678337, + "grad_norm": 3.6528842449188232, + "learning_rate": 1.9657385119507118e-05, + "loss": 1.292, + "step": 377 + }, + { + "epoch": 0.2860927152317881, + "grad_norm": 3.849123239517212, + "learning_rate": 1.965529855356096e-05, + "loss": 1.3114, + "step": 378 + }, + { + "epoch": 0.2868495742667928, + "grad_norm": 3.7049927711486816, + "learning_rate": 1.9653205764601182e-05, + "loss": 1.3314, + "step": 379 + }, + { + "epoch": 0.2876064333017975, + "grad_norm": 4.335115909576416, + "learning_rate": 1.9651106753976768e-05, + "loss": 1.3719, + "step": 380 + }, + { + "epoch": 0.28836329233680225, + "grad_norm": 4.870954990386963, + "learning_rate": 1.964900152304071e-05, + "loss": 1.3264, + "step": 381 + }, + { + "epoch": 0.289120151371807, + "grad_norm": 4.583834648132324, + "learning_rate": 1.9646890073150005e-05, + "loss": 1.3743, + "step": 382 + }, + { + "epoch": 0.28987701040681174, + "grad_norm": 3.795956611633301, + "learning_rate": 1.964477240566566e-05, + "loss": 1.2997, + "step": 383 + }, + { + "epoch": 0.29063386944181646, + "grad_norm": 5.41873025894165, + "learning_rate": 1.9642648521952695e-05, + "loss": 1.3381, + "step": 384 + }, + { + "epoch": 0.2913907284768212, + "grad_norm": 4.2772393226623535, + "learning_rate": 1.9640518423380127e-05, + "loss": 1.3322, + "step": 385 + }, + { + "epoch": 0.2921475875118259, + "grad_norm": 10.241232872009277, + "learning_rate": 1.9638382111320996e-05, + "loss": 1.3249, + "step": 386 + }, + { + "epoch": 0.2929044465468307, + "grad_norm": 3.4204752445220947, + "learning_rate": 1.9636239587152323e-05, + "loss": 1.3295, + "step": 387 + }, + { + "epoch": 0.2936613055818354, + "grad_norm": 3.368516683578491, + "learning_rate": 1.9634090852255154e-05, + "loss": 1.3561, + "step": 388 + }, + { + "epoch": 0.2944181646168401, + "grad_norm": 3.5226809978485107, + "learning_rate": 1.9631935908014532e-05, + "loss": 1.3146, + "step": 389 + }, + { + "epoch": 0.29517502365184484, + "grad_norm": 3.446794271469116, + "learning_rate": 1.9629774755819495e-05, + "loss": 1.2973, + "step": 390 + }, + { + "epoch": 0.29593188268684956, + "grad_norm": 3.176982879638672, + "learning_rate": 1.9627607397063097e-05, + "loss": 1.3233, + "step": 391 + }, + { + "epoch": 0.29668874172185433, + "grad_norm": 3.0656180381774902, + "learning_rate": 1.9625433833142376e-05, + "loss": 1.3246, + "step": 392 + }, + { + "epoch": 0.29744560075685905, + "grad_norm": 3.4723055362701416, + "learning_rate": 1.9623254065458387e-05, + "loss": 1.3461, + "step": 393 + }, + { + "epoch": 0.29820245979186377, + "grad_norm": 3.2769827842712402, + "learning_rate": 1.962106809541616e-05, + "loss": 1.3242, + "step": 394 + }, + { + "epoch": 0.2989593188268685, + "grad_norm": 2.8769099712371826, + "learning_rate": 1.9618875924424756e-05, + "loss": 1.2548, + "step": 395 + }, + { + "epoch": 0.2997161778618732, + "grad_norm": 3.3671765327453613, + "learning_rate": 1.9616677553897204e-05, + "loss": 1.3241, + "step": 396 + }, + { + "epoch": 0.30047303689687793, + "grad_norm": 3.104637384414673, + "learning_rate": 1.9614472985250547e-05, + "loss": 1.3121, + "step": 397 + }, + { + "epoch": 0.3012298959318827, + "grad_norm": 3.6635613441467285, + "learning_rate": 1.9612262219905807e-05, + "loss": 1.3157, + "step": 398 + }, + { + "epoch": 0.3019867549668874, + "grad_norm": 3.4978229999542236, + "learning_rate": 1.9610045259288017e-05, + "loss": 1.3566, + "step": 399 + }, + { + "epoch": 0.30274361400189215, + "grad_norm": 3.084291458129883, + "learning_rate": 1.9607822104826198e-05, + "loss": 1.3157, + "step": 400 + }, + { + "epoch": 0.30350047303689687, + "grad_norm": 2.742034673690796, + "learning_rate": 1.9605592757953354e-05, + "loss": 1.2778, + "step": 401 + }, + { + "epoch": 0.3042573320719016, + "grad_norm": 2.89613938331604, + "learning_rate": 1.960335722010649e-05, + "loss": 1.3467, + "step": 402 + }, + { + "epoch": 0.30501419110690636, + "grad_norm": 3.3919119834899902, + "learning_rate": 1.9601115492726603e-05, + "loss": 1.3264, + "step": 403 + }, + { + "epoch": 0.3057710501419111, + "grad_norm": 2.5944290161132812, + "learning_rate": 1.9598867577258672e-05, + "loss": 1.3765, + "step": 404 + }, + { + "epoch": 0.3065279091769158, + "grad_norm": 2.6674866676330566, + "learning_rate": 1.9596613475151674e-05, + "loss": 1.3077, + "step": 405 + }, + { + "epoch": 0.3072847682119205, + "grad_norm": 2.9125916957855225, + "learning_rate": 1.9594353187858567e-05, + "loss": 1.3472, + "step": 406 + }, + { + "epoch": 0.30804162724692524, + "grad_norm": 2.746316909790039, + "learning_rate": 1.9592086716836292e-05, + "loss": 1.3137, + "step": 407 + }, + { + "epoch": 0.30879848628193, + "grad_norm": 3.5916221141815186, + "learning_rate": 1.958981406354579e-05, + "loss": 1.3181, + "step": 408 + }, + { + "epoch": 0.30955534531693474, + "grad_norm": 2.9677999019622803, + "learning_rate": 1.9587535229451973e-05, + "loss": 1.3094, + "step": 409 + }, + { + "epoch": 0.31031220435193946, + "grad_norm": 2.766179084777832, + "learning_rate": 1.9585250216023746e-05, + "loss": 1.3205, + "step": 410 + }, + { + "epoch": 0.3110690633869442, + "grad_norm": 3.019426107406616, + "learning_rate": 1.9582959024733992e-05, + "loss": 1.3053, + "step": 411 + }, + { + "epoch": 0.3118259224219489, + "grad_norm": 3.580401659011841, + "learning_rate": 1.9580661657059582e-05, + "loss": 1.3685, + "step": 412 + }, + { + "epoch": 0.3125827814569536, + "grad_norm": 3.2559759616851807, + "learning_rate": 1.957835811448136e-05, + "loss": 1.2975, + "step": 413 + }, + { + "epoch": 0.3133396404919584, + "grad_norm": 3.185425281524658, + "learning_rate": 1.957604839848415e-05, + "loss": 1.3391, + "step": 414 + }, + { + "epoch": 0.3140964995269631, + "grad_norm": 3.2222900390625, + "learning_rate": 1.9573732510556772e-05, + "loss": 1.2233, + "step": 415 + }, + { + "epoch": 0.31485335856196783, + "grad_norm": 3.3176467418670654, + "learning_rate": 1.9571410452192003e-05, + "loss": 1.32, + "step": 416 + }, + { + "epoch": 0.31561021759697255, + "grad_norm": 2.996213912963867, + "learning_rate": 1.9569082224886607e-05, + "loss": 1.3158, + "step": 417 + }, + { + "epoch": 0.3163670766319773, + "grad_norm": 2.757145881652832, + "learning_rate": 1.9566747830141327e-05, + "loss": 1.2747, + "step": 418 + }, + { + "epoch": 0.31712393566698205, + "grad_norm": 3.0630686283111572, + "learning_rate": 1.9564407269460873e-05, + "loss": 1.2863, + "step": 419 + }, + { + "epoch": 0.31788079470198677, + "grad_norm": 2.979710102081299, + "learning_rate": 1.956206054435394e-05, + "loss": 1.3017, + "step": 420 + }, + { + "epoch": 0.3186376537369915, + "grad_norm": 2.9305684566497803, + "learning_rate": 1.955970765633319e-05, + "loss": 1.2655, + "step": 421 + }, + { + "epoch": 0.3193945127719962, + "grad_norm": 3.2490427494049072, + "learning_rate": 1.955734860691526e-05, + "loss": 1.312, + "step": 422 + }, + { + "epoch": 0.32015137180700093, + "grad_norm": 2.647688388824463, + "learning_rate": 1.9554983397620754e-05, + "loss": 1.3009, + "step": 423 + }, + { + "epoch": 0.3209082308420057, + "grad_norm": 2.8015365600585938, + "learning_rate": 1.9552612029974246e-05, + "loss": 1.3069, + "step": 424 + }, + { + "epoch": 0.3216650898770104, + "grad_norm": 2.606043577194214, + "learning_rate": 1.9550234505504294e-05, + "loss": 1.2951, + "step": 425 + }, + { + "epoch": 0.32242194891201514, + "grad_norm": 2.9746274948120117, + "learning_rate": 1.9547850825743407e-05, + "loss": 1.2736, + "step": 426 + }, + { + "epoch": 0.32317880794701986, + "grad_norm": 3.0589208602905273, + "learning_rate": 1.9545460992228074e-05, + "loss": 1.3242, + "step": 427 + }, + { + "epoch": 0.3239356669820246, + "grad_norm": 3.041224956512451, + "learning_rate": 1.954306500649874e-05, + "loss": 1.3397, + "step": 428 + }, + { + "epoch": 0.3246925260170293, + "grad_norm": 2.700326681137085, + "learning_rate": 1.954066287009982e-05, + "loss": 1.2822, + "step": 429 + }, + { + "epoch": 0.3254493850520341, + "grad_norm": 2.7489256858825684, + "learning_rate": 1.95382545845797e-05, + "loss": 1.3056, + "step": 430 + }, + { + "epoch": 0.3262062440870388, + "grad_norm": 3.0966339111328125, + "learning_rate": 1.953584015149072e-05, + "loss": 1.3316, + "step": 431 + }, + { + "epoch": 0.3269631031220435, + "grad_norm": 2.661102533340454, + "learning_rate": 1.9533419572389186e-05, + "loss": 1.3017, + "step": 432 + }, + { + "epoch": 0.32771996215704824, + "grad_norm": 3.1965274810791016, + "learning_rate": 1.9530992848835367e-05, + "loss": 1.2975, + "step": 433 + }, + { + "epoch": 0.32847682119205296, + "grad_norm": 3.0282115936279297, + "learning_rate": 1.9528559982393497e-05, + "loss": 1.3261, + "step": 434 + }, + { + "epoch": 0.32923368022705773, + "grad_norm": 2.794201374053955, + "learning_rate": 1.9526120974631763e-05, + "loss": 1.3363, + "step": 435 + }, + { + "epoch": 0.32999053926206245, + "grad_norm": 2.8009607791900635, + "learning_rate": 1.9523675827122305e-05, + "loss": 1.2738, + "step": 436 + }, + { + "epoch": 0.3307473982970672, + "grad_norm": 3.1605050563812256, + "learning_rate": 1.952122454144123e-05, + "loss": 1.3242, + "step": 437 + }, + { + "epoch": 0.3315042573320719, + "grad_norm": 2.7758185863494873, + "learning_rate": 1.9518767119168608e-05, + "loss": 1.2546, + "step": 438 + }, + { + "epoch": 0.3322611163670766, + "grad_norm": 3.3435556888580322, + "learning_rate": 1.9516303561888446e-05, + "loss": 1.2966, + "step": 439 + }, + { + "epoch": 0.3330179754020814, + "grad_norm": 3.4312620162963867, + "learning_rate": 1.9513833871188724e-05, + "loss": 1.328, + "step": 440 + }, + { + "epoch": 0.3337748344370861, + "grad_norm": 3.4291491508483887, + "learning_rate": 1.951135804866136e-05, + "loss": 1.2927, + "step": 441 + }, + { + "epoch": 0.33453169347209083, + "grad_norm": 2.797574281692505, + "learning_rate": 1.9508876095902236e-05, + "loss": 1.3218, + "step": 442 + }, + { + "epoch": 0.33528855250709555, + "grad_norm": 3.1859307289123535, + "learning_rate": 1.9506388014511176e-05, + "loss": 1.2827, + "step": 443 + }, + { + "epoch": 0.33604541154210027, + "grad_norm": 3.4026360511779785, + "learning_rate": 1.950389380609196e-05, + "loss": 1.2879, + "step": 444 + }, + { + "epoch": 0.336802270577105, + "grad_norm": 3.2964580059051514, + "learning_rate": 1.9501393472252324e-05, + "loss": 1.2976, + "step": 445 + }, + { + "epoch": 0.33755912961210977, + "grad_norm": 3.227969169616699, + "learning_rate": 1.9498887014603937e-05, + "loss": 1.3191, + "step": 446 + }, + { + "epoch": 0.3383159886471145, + "grad_norm": 4.118795871734619, + "learning_rate": 1.949637443476243e-05, + "loss": 1.3112, + "step": 447 + }, + { + "epoch": 0.3390728476821192, + "grad_norm": 3.7260451316833496, + "learning_rate": 1.9493855734347367e-05, + "loss": 1.2836, + "step": 448 + }, + { + "epoch": 0.3398297067171239, + "grad_norm": 3.0048820972442627, + "learning_rate": 1.9491330914982265e-05, + "loss": 1.3106, + "step": 449 + }, + { + "epoch": 0.34058656575212864, + "grad_norm": 2.7483198642730713, + "learning_rate": 1.9488799978294586e-05, + "loss": 1.338, + "step": 450 + }, + { + "epoch": 0.3413434247871334, + "grad_norm": 3.021895170211792, + "learning_rate": 1.9486262925915736e-05, + "loss": 1.2931, + "step": 451 + }, + { + "epoch": 0.34210028382213814, + "grad_norm": 2.793663740158081, + "learning_rate": 1.948371975948106e-05, + "loss": 1.2895, + "step": 452 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 3.6046817302703857, + "learning_rate": 1.9481170480629835e-05, + "loss": 1.326, + "step": 453 + }, + { + "epoch": 0.3436140018921476, + "grad_norm": 2.8959131240844727, + "learning_rate": 1.9478615091005296e-05, + "loss": 1.3018, + "step": 454 + }, + { + "epoch": 0.3443708609271523, + "grad_norm": 2.869874954223633, + "learning_rate": 1.9476053592254608e-05, + "loss": 1.3181, + "step": 455 + }, + { + "epoch": 0.3451277199621571, + "grad_norm": 2.9448678493499756, + "learning_rate": 1.947348598602887e-05, + "loss": 1.2992, + "step": 456 + }, + { + "epoch": 0.3458845789971618, + "grad_norm": 2.8842031955718994, + "learning_rate": 1.9470912273983123e-05, + "loss": 1.3297, + "step": 457 + }, + { + "epoch": 0.3466414380321665, + "grad_norm": 3.329968214035034, + "learning_rate": 1.946833245777635e-05, + "loss": 1.3074, + "step": 458 + }, + { + "epoch": 0.34739829706717124, + "grad_norm": 2.8565642833709717, + "learning_rate": 1.9465746539071447e-05, + "loss": 1.3204, + "step": 459 + }, + { + "epoch": 0.34815515610217596, + "grad_norm": 3.0529487133026123, + "learning_rate": 1.946315451953527e-05, + "loss": 1.3249, + "step": 460 + }, + { + "epoch": 0.3489120151371807, + "grad_norm": 2.988011360168457, + "learning_rate": 1.946055640083859e-05, + "loss": 1.2612, + "step": 461 + }, + { + "epoch": 0.34966887417218545, + "grad_norm": 3.3266758918762207, + "learning_rate": 1.945795218465611e-05, + "loss": 1.3283, + "step": 462 + }, + { + "epoch": 0.35042573320719017, + "grad_norm": 3.2849862575531006, + "learning_rate": 1.945534187266648e-05, + "loss": 1.3476, + "step": 463 + }, + { + "epoch": 0.3511825922421949, + "grad_norm": 2.831113576889038, + "learning_rate": 1.945272546655226e-05, + "loss": 1.2726, + "step": 464 + }, + { + "epoch": 0.3519394512771996, + "grad_norm": 3.232224464416504, + "learning_rate": 1.9450102967999946e-05, + "loss": 1.3362, + "step": 465 + }, + { + "epoch": 0.35269631031220433, + "grad_norm": 3.704671621322632, + "learning_rate": 1.944747437869996e-05, + "loss": 1.3011, + "step": 466 + }, + { + "epoch": 0.3534531693472091, + "grad_norm": 2.6540513038635254, + "learning_rate": 1.944483970034665e-05, + "loss": 1.3268, + "step": 467 + }, + { + "epoch": 0.3542100283822138, + "grad_norm": 3.856849431991577, + "learning_rate": 1.944219893463829e-05, + "loss": 1.2762, + "step": 468 + }, + { + "epoch": 0.35496688741721855, + "grad_norm": 2.809225082397461, + "learning_rate": 1.943955208327708e-05, + "loss": 1.2515, + "step": 469 + }, + { + "epoch": 0.35572374645222327, + "grad_norm": 3.271754503250122, + "learning_rate": 1.943689914796914e-05, + "loss": 1.313, + "step": 470 + }, + { + "epoch": 0.356480605487228, + "grad_norm": 2.872096061706543, + "learning_rate": 1.9434240130424504e-05, + "loss": 1.2762, + "step": 471 + }, + { + "epoch": 0.35723746452223276, + "grad_norm": 2.9466817378997803, + "learning_rate": 1.9431575032357147e-05, + "loss": 1.3123, + "step": 472 + }, + { + "epoch": 0.3579943235572375, + "grad_norm": 3.358745813369751, + "learning_rate": 1.9428903855484938e-05, + "loss": 1.2684, + "step": 473 + }, + { + "epoch": 0.3587511825922422, + "grad_norm": 3.3290534019470215, + "learning_rate": 1.9426226601529685e-05, + "loss": 1.321, + "step": 474 + }, + { + "epoch": 0.3595080416272469, + "grad_norm": 3.1677582263946533, + "learning_rate": 1.9423543272217103e-05, + "loss": 1.2994, + "step": 475 + }, + { + "epoch": 0.36026490066225164, + "grad_norm": 3.891291618347168, + "learning_rate": 1.9420853869276822e-05, + "loss": 1.2783, + "step": 476 + }, + { + "epoch": 0.36102175969725636, + "grad_norm": 3.3545546531677246, + "learning_rate": 1.9418158394442395e-05, + "loss": 1.2985, + "step": 477 + }, + { + "epoch": 0.36177861873226114, + "grad_norm": 3.187551498413086, + "learning_rate": 1.941545684945128e-05, + "loss": 1.3401, + "step": 478 + }, + { + "epoch": 0.36253547776726586, + "grad_norm": 3.063565969467163, + "learning_rate": 1.9412749236044855e-05, + "loss": 1.2574, + "step": 479 + }, + { + "epoch": 0.3632923368022706, + "grad_norm": 3.0356266498565674, + "learning_rate": 1.9410035555968403e-05, + "loss": 1.2734, + "step": 480 + }, + { + "epoch": 0.3640491958372753, + "grad_norm": 4.256435871124268, + "learning_rate": 1.9407315810971123e-05, + "loss": 1.2623, + "step": 481 + }, + { + "epoch": 0.36480605487228, + "grad_norm": 3.298546075820923, + "learning_rate": 1.9404590002806122e-05, + "loss": 1.3079, + "step": 482 + }, + { + "epoch": 0.3655629139072848, + "grad_norm": 3.06703782081604, + "learning_rate": 1.9401858133230412e-05, + "loss": 1.347, + "step": 483 + }, + { + "epoch": 0.3663197729422895, + "grad_norm": 3.244100332260132, + "learning_rate": 1.9399120204004917e-05, + "loss": 1.298, + "step": 484 + }, + { + "epoch": 0.36707663197729423, + "grad_norm": 2.7238996028900146, + "learning_rate": 1.9396376216894462e-05, + "loss": 1.2434, + "step": 485 + }, + { + "epoch": 0.36783349101229895, + "grad_norm": 3.0345072746276855, + "learning_rate": 1.939362617366778e-05, + "loss": 1.3634, + "step": 486 + }, + { + "epoch": 0.36859035004730367, + "grad_norm": 3.2847676277160645, + "learning_rate": 1.9390870076097507e-05, + "loss": 1.3037, + "step": 487 + }, + { + "epoch": 0.36934720908230845, + "grad_norm": 2.8988196849823, + "learning_rate": 1.9388107925960183e-05, + "loss": 1.3137, + "step": 488 + }, + { + "epoch": 0.37010406811731317, + "grad_norm": 3.4510462284088135, + "learning_rate": 1.9385339725036244e-05, + "loss": 1.3042, + "step": 489 + }, + { + "epoch": 0.3708609271523179, + "grad_norm": 2.7349674701690674, + "learning_rate": 1.938256547511003e-05, + "loss": 1.3104, + "step": 490 + }, + { + "epoch": 0.3716177861873226, + "grad_norm": 2.6827545166015625, + "learning_rate": 1.9379785177969787e-05, + "loss": 1.3312, + "step": 491 + }, + { + "epoch": 0.3723746452223273, + "grad_norm": 2.871415138244629, + "learning_rate": 1.937699883540765e-05, + "loss": 1.2999, + "step": 492 + }, + { + "epoch": 0.37313150425733205, + "grad_norm": 3.261521339416504, + "learning_rate": 1.9374206449219646e-05, + "loss": 1.3027, + "step": 493 + }, + { + "epoch": 0.3738883632923368, + "grad_norm": 2.7188801765441895, + "learning_rate": 1.9371408021205708e-05, + "loss": 1.2688, + "step": 494 + }, + { + "epoch": 0.37464522232734154, + "grad_norm": 2.762587070465088, + "learning_rate": 1.936860355316967e-05, + "loss": 1.316, + "step": 495 + }, + { + "epoch": 0.37540208136234626, + "grad_norm": 3.2157773971557617, + "learning_rate": 1.9365793046919233e-05, + "loss": 1.2818, + "step": 496 + }, + { + "epoch": 0.376158940397351, + "grad_norm": 2.720599889755249, + "learning_rate": 1.9362976504266017e-05, + "loss": 1.2767, + "step": 497 + }, + { + "epoch": 0.3769157994323557, + "grad_norm": 2.728111982345581, + "learning_rate": 1.936015392702552e-05, + "loss": 1.3292, + "step": 498 + }, + { + "epoch": 0.3776726584673605, + "grad_norm": 3.2255611419677734, + "learning_rate": 1.9357325317017127e-05, + "loss": 1.3165, + "step": 499 + }, + { + "epoch": 0.3784295175023652, + "grad_norm": 3.559377431869507, + "learning_rate": 1.935449067606413e-05, + "loss": 1.3284, + "step": 500 + }, + { + "epoch": 0.3791863765373699, + "grad_norm": 3.0584166049957275, + "learning_rate": 1.935165000599368e-05, + "loss": 1.2788, + "step": 501 + }, + { + "epoch": 0.37994323557237464, + "grad_norm": 3.422832727432251, + "learning_rate": 1.9348803308636836e-05, + "loss": 1.3315, + "step": 502 + }, + { + "epoch": 0.38070009460737936, + "grad_norm": 3.052250623703003, + "learning_rate": 1.9345950585828543e-05, + "loss": 1.2772, + "step": 503 + }, + { + "epoch": 0.38145695364238413, + "grad_norm": 3.164451837539673, + "learning_rate": 1.9343091839407608e-05, + "loss": 1.2796, + "step": 504 + }, + { + "epoch": 0.38221381267738885, + "grad_norm": 2.815291166305542, + "learning_rate": 1.9340227071216747e-05, + "loss": 1.2473, + "step": 505 + }, + { + "epoch": 0.3829706717123936, + "grad_norm": 2.953880548477173, + "learning_rate": 1.9337356283102543e-05, + "loss": 1.299, + "step": 506 + }, + { + "epoch": 0.3837275307473983, + "grad_norm": 2.8606905937194824, + "learning_rate": 1.9334479476915462e-05, + "loss": 1.3075, + "step": 507 + }, + { + "epoch": 0.384484389782403, + "grad_norm": 2.8410329818725586, + "learning_rate": 1.9331596654509848e-05, + "loss": 1.3377, + "step": 508 + }, + { + "epoch": 0.38524124881740773, + "grad_norm": 3.284508228302002, + "learning_rate": 1.9328707817743923e-05, + "loss": 1.2549, + "step": 509 + }, + { + "epoch": 0.3859981078524125, + "grad_norm": 2.892754554748535, + "learning_rate": 1.9325812968479793e-05, + "loss": 1.292, + "step": 510 + }, + { + "epoch": 0.38675496688741723, + "grad_norm": 3.1711387634277344, + "learning_rate": 1.932291210858343e-05, + "loss": 1.2258, + "step": 511 + }, + { + "epoch": 0.38751182592242195, + "grad_norm": 2.6686558723449707, + "learning_rate": 1.932000523992468e-05, + "loss": 1.246, + "step": 512 + }, + { + "epoch": 0.38826868495742667, + "grad_norm": 2.947592258453369, + "learning_rate": 1.9317092364377273e-05, + "loss": 1.2544, + "step": 513 + }, + { + "epoch": 0.3890255439924314, + "grad_norm": 3.5232532024383545, + "learning_rate": 1.93141734838188e-05, + "loss": 1.3041, + "step": 514 + }, + { + "epoch": 0.38978240302743616, + "grad_norm": 3.5411503314971924, + "learning_rate": 1.931124860013073e-05, + "loss": 1.2852, + "step": 515 + }, + { + "epoch": 0.3905392620624409, + "grad_norm": 3.0337717533111572, + "learning_rate": 1.93083177151984e-05, + "loss": 1.2867, + "step": 516 + }, + { + "epoch": 0.3912961210974456, + "grad_norm": 3.082791805267334, + "learning_rate": 1.9305380830911002e-05, + "loss": 1.2981, + "step": 517 + }, + { + "epoch": 0.3920529801324503, + "grad_norm": 3.1100258827209473, + "learning_rate": 1.9302437949161622e-05, + "loss": 1.2645, + "step": 518 + }, + { + "epoch": 0.39280983916745504, + "grad_norm": 3.3867480754852295, + "learning_rate": 1.9299489071847185e-05, + "loss": 1.3555, + "step": 519 + }, + { + "epoch": 0.3935666982024598, + "grad_norm": 3.1625607013702393, + "learning_rate": 1.9296534200868504e-05, + "loss": 1.3111, + "step": 520 + }, + { + "epoch": 0.39432355723746454, + "grad_norm": 4.334456443786621, + "learning_rate": 1.929357333813023e-05, + "loss": 1.3102, + "step": 521 + }, + { + "epoch": 0.39508041627246926, + "grad_norm": 3.650447130203247, + "learning_rate": 1.9290606485540903e-05, + "loss": 1.3129, + "step": 522 + }, + { + "epoch": 0.395837275307474, + "grad_norm": 3.8689398765563965, + "learning_rate": 1.9287633645012898e-05, + "loss": 1.2974, + "step": 523 + }, + { + "epoch": 0.3965941343424787, + "grad_norm": 3.520089864730835, + "learning_rate": 1.9284654818462474e-05, + "loss": 1.291, + "step": 524 + }, + { + "epoch": 0.3973509933774834, + "grad_norm": 4.220740795135498, + "learning_rate": 1.9281670007809735e-05, + "loss": 1.3039, + "step": 525 + }, + { + "epoch": 0.3981078524124882, + "grad_norm": 3.871176242828369, + "learning_rate": 1.9278679214978637e-05, + "loss": 1.2682, + "step": 526 + }, + { + "epoch": 0.3988647114474929, + "grad_norm": 3.2446093559265137, + "learning_rate": 1.9275682441897007e-05, + "loss": 1.2866, + "step": 527 + }, + { + "epoch": 0.39962157048249763, + "grad_norm": 3.475529432296753, + "learning_rate": 1.9272679690496517e-05, + "loss": 1.344, + "step": 528 + }, + { + "epoch": 0.40037842951750235, + "grad_norm": 3.29640531539917, + "learning_rate": 1.9269670962712695e-05, + "loss": 1.3257, + "step": 529 + }, + { + "epoch": 0.4011352885525071, + "grad_norm": 3.43729305267334, + "learning_rate": 1.9266656260484925e-05, + "loss": 1.3504, + "step": 530 + }, + { + "epoch": 0.40189214758751185, + "grad_norm": 3.6663601398468018, + "learning_rate": 1.9263635585756424e-05, + "loss": 1.2738, + "step": 531 + }, + { + "epoch": 0.40264900662251657, + "grad_norm": 3.4716086387634277, + "learning_rate": 1.9260608940474293e-05, + "loss": 1.2997, + "step": 532 + }, + { + "epoch": 0.4034058656575213, + "grad_norm": 3.0576701164245605, + "learning_rate": 1.9257576326589448e-05, + "loss": 1.2958, + "step": 533 + }, + { + "epoch": 0.404162724692526, + "grad_norm": 3.7031450271606445, + "learning_rate": 1.9254537746056664e-05, + "loss": 1.2537, + "step": 534 + }, + { + "epoch": 0.40491958372753073, + "grad_norm": 3.070580005645752, + "learning_rate": 1.925149320083457e-05, + "loss": 1.3362, + "step": 535 + }, + { + "epoch": 0.4056764427625355, + "grad_norm": 3.241197347640991, + "learning_rate": 1.9248442692885634e-05, + "loss": 1.2984, + "step": 536 + }, + { + "epoch": 0.4064333017975402, + "grad_norm": 2.7833101749420166, + "learning_rate": 1.9245386224176162e-05, + "loss": 1.2589, + "step": 537 + }, + { + "epoch": 0.40719016083254495, + "grad_norm": 2.8053226470947266, + "learning_rate": 1.9242323796676313e-05, + "loss": 1.277, + "step": 538 + }, + { + "epoch": 0.40794701986754967, + "grad_norm": 3.119124412536621, + "learning_rate": 1.9239255412360075e-05, + "loss": 1.2516, + "step": 539 + }, + { + "epoch": 0.4087038789025544, + "grad_norm": 3.013762950897217, + "learning_rate": 1.923618107320529e-05, + "loss": 1.2988, + "step": 540 + }, + { + "epoch": 0.4094607379375591, + "grad_norm": 2.8327529430389404, + "learning_rate": 1.923310078119362e-05, + "loss": 1.2596, + "step": 541 + }, + { + "epoch": 0.4102175969725639, + "grad_norm": 2.7732462882995605, + "learning_rate": 1.9230014538310575e-05, + "loss": 1.2525, + "step": 542 + }, + { + "epoch": 0.4109744560075686, + "grad_norm": 2.984377145767212, + "learning_rate": 1.9226922346545513e-05, + "loss": 1.2688, + "step": 543 + }, + { + "epoch": 0.4117313150425733, + "grad_norm": 3.146101474761963, + "learning_rate": 1.92238242078916e-05, + "loss": 1.3291, + "step": 544 + }, + { + "epoch": 0.41248817407757804, + "grad_norm": 2.911142587661743, + "learning_rate": 1.9220720124345855e-05, + "loss": 1.2372, + "step": 545 + }, + { + "epoch": 0.41324503311258276, + "grad_norm": 3.006364345550537, + "learning_rate": 1.921761009790912e-05, + "loss": 1.2157, + "step": 546 + }, + { + "epoch": 0.41400189214758754, + "grad_norm": 2.9054133892059326, + "learning_rate": 1.9214494130586074e-05, + "loss": 1.3591, + "step": 547 + }, + { + "epoch": 0.41475875118259226, + "grad_norm": 2.9922358989715576, + "learning_rate": 1.9211372224385222e-05, + "loss": 1.3093, + "step": 548 + }, + { + "epoch": 0.415515610217597, + "grad_norm": 2.6461005210876465, + "learning_rate": 1.9208244381318892e-05, + "loss": 1.2585, + "step": 549 + }, + { + "epoch": 0.4162724692526017, + "grad_norm": 2.7143542766571045, + "learning_rate": 1.9205110603403247e-05, + "loss": 1.2594, + "step": 550 + }, + { + "epoch": 0.4170293282876064, + "grad_norm": 2.9333744049072266, + "learning_rate": 1.9201970892658273e-05, + "loss": 1.3178, + "step": 551 + }, + { + "epoch": 0.4177861873226112, + "grad_norm": 2.956841230392456, + "learning_rate": 1.919882525110778e-05, + "loss": 1.2745, + "step": 552 + }, + { + "epoch": 0.4185430463576159, + "grad_norm": 3.0672903060913086, + "learning_rate": 1.91956736807794e-05, + "loss": 1.2648, + "step": 553 + }, + { + "epoch": 0.41929990539262063, + "grad_norm": 2.7969796657562256, + "learning_rate": 1.9192516183704587e-05, + "loss": 1.3154, + "step": 554 + }, + { + "epoch": 0.42005676442762535, + "grad_norm": 2.9009835720062256, + "learning_rate": 1.9189352761918616e-05, + "loss": 1.2412, + "step": 555 + }, + { + "epoch": 0.42081362346263007, + "grad_norm": 2.8731672763824463, + "learning_rate": 1.918618341746058e-05, + "loss": 1.2811, + "step": 556 + }, + { + "epoch": 0.4215704824976348, + "grad_norm": 2.7065563201904297, + "learning_rate": 1.918300815237339e-05, + "loss": 1.2895, + "step": 557 + }, + { + "epoch": 0.42232734153263957, + "grad_norm": 2.670109748840332, + "learning_rate": 1.9179826968703775e-05, + "loss": 1.2809, + "step": 558 + }, + { + "epoch": 0.4230842005676443, + "grad_norm": 2.9249067306518555, + "learning_rate": 1.9176639868502273e-05, + "loss": 1.3528, + "step": 559 + }, + { + "epoch": 0.423841059602649, + "grad_norm": 2.733651638031006, + "learning_rate": 1.917344685382325e-05, + "loss": 1.2516, + "step": 560 + }, + { + "epoch": 0.4245979186376537, + "grad_norm": 3.126077651977539, + "learning_rate": 1.9170247926724863e-05, + "loss": 1.3048, + "step": 561 + }, + { + "epoch": 0.42535477767265845, + "grad_norm": 3.024705648422241, + "learning_rate": 1.9167043089269096e-05, + "loss": 1.2871, + "step": 562 + }, + { + "epoch": 0.4261116367076632, + "grad_norm": 3.0809972286224365, + "learning_rate": 1.916383234352174e-05, + "loss": 1.2939, + "step": 563 + }, + { + "epoch": 0.42686849574266794, + "grad_norm": 2.8006155490875244, + "learning_rate": 1.9160615691552388e-05, + "loss": 1.2681, + "step": 564 + }, + { + "epoch": 0.42762535477767266, + "grad_norm": 3.146348714828491, + "learning_rate": 1.915739313543445e-05, + "loss": 1.299, + "step": 565 + }, + { + "epoch": 0.4283822138126774, + "grad_norm": 2.707672119140625, + "learning_rate": 1.915416467724514e-05, + "loss": 1.305, + "step": 566 + }, + { + "epoch": 0.4291390728476821, + "grad_norm": 3.0839362144470215, + "learning_rate": 1.9150930319065465e-05, + "loss": 1.2806, + "step": 567 + }, + { + "epoch": 0.4298959318826869, + "grad_norm": 2.6987831592559814, + "learning_rate": 1.9147690062980243e-05, + "loss": 1.2449, + "step": 568 + }, + { + "epoch": 0.4306527909176916, + "grad_norm": 3.5137927532196045, + "learning_rate": 1.9144443911078098e-05, + "loss": 1.2525, + "step": 569 + }, + { + "epoch": 0.4314096499526963, + "grad_norm": 2.656526803970337, + "learning_rate": 1.914119186545145e-05, + "loss": 1.2801, + "step": 570 + }, + { + "epoch": 0.43216650898770104, + "grad_norm": 2.7091798782348633, + "learning_rate": 1.9137933928196514e-05, + "loss": 1.2743, + "step": 571 + }, + { + "epoch": 0.43292336802270576, + "grad_norm": 2.6860084533691406, + "learning_rate": 1.913467010141331e-05, + "loss": 1.2569, + "step": 572 + }, + { + "epoch": 0.4336802270577105, + "grad_norm": 2.8987984657287598, + "learning_rate": 1.9131400387205653e-05, + "loss": 1.2411, + "step": 573 + }, + { + "epoch": 0.43443708609271525, + "grad_norm": 2.579749584197998, + "learning_rate": 1.9128124787681145e-05, + "loss": 1.2344, + "step": 574 + }, + { + "epoch": 0.43519394512771997, + "grad_norm": 2.835766553878784, + "learning_rate": 1.912484330495119e-05, + "loss": 1.2922, + "step": 575 + }, + { + "epoch": 0.4359508041627247, + "grad_norm": 3.549691915512085, + "learning_rate": 1.9121555941130986e-05, + "loss": 1.2908, + "step": 576 + }, + { + "epoch": 0.4367076631977294, + "grad_norm": 2.881730556488037, + "learning_rate": 1.911826269833951e-05, + "loss": 1.2787, + "step": 577 + }, + { + "epoch": 0.43746452223273413, + "grad_norm": 2.881334066390991, + "learning_rate": 1.9114963578699538e-05, + "loss": 1.3111, + "step": 578 + }, + { + "epoch": 0.4382213812677389, + "grad_norm": 2.941556453704834, + "learning_rate": 1.911165858433764e-05, + "loss": 1.2857, + "step": 579 + }, + { + "epoch": 0.4389782403027436, + "grad_norm": 2.6916472911834717, + "learning_rate": 1.9108347717384156e-05, + "loss": 1.2512, + "step": 580 + }, + { + "epoch": 0.43973509933774835, + "grad_norm": 3.0234310626983643, + "learning_rate": 1.9105030979973223e-05, + "loss": 1.2089, + "step": 581 + }, + { + "epoch": 0.44049195837275307, + "grad_norm": 2.7675161361694336, + "learning_rate": 1.9101708374242764e-05, + "loss": 1.3253, + "step": 582 + }, + { + "epoch": 0.4412488174077578, + "grad_norm": 2.746612310409546, + "learning_rate": 1.909837990233447e-05, + "loss": 1.2554, + "step": 583 + }, + { + "epoch": 0.44200567644276256, + "grad_norm": 2.629913091659546, + "learning_rate": 1.9095045566393834e-05, + "loss": 1.3158, + "step": 584 + }, + { + "epoch": 0.4427625354777673, + "grad_norm": 3.0382394790649414, + "learning_rate": 1.909170536857011e-05, + "loss": 1.3382, + "step": 585 + }, + { + "epoch": 0.443519394512772, + "grad_norm": 3.1332645416259766, + "learning_rate": 1.908835931101634e-05, + "loss": 1.2561, + "step": 586 + }, + { + "epoch": 0.4442762535477767, + "grad_norm": 2.91369891166687, + "learning_rate": 1.9085007395889342e-05, + "loss": 1.287, + "step": 587 + }, + { + "epoch": 0.44503311258278144, + "grad_norm": 2.6690382957458496, + "learning_rate": 1.9081649625349715e-05, + "loss": 1.275, + "step": 588 + }, + { + "epoch": 0.44578997161778616, + "grad_norm": 2.7576904296875, + "learning_rate": 1.9078286001561822e-05, + "loss": 1.2669, + "step": 589 + }, + { + "epoch": 0.44654683065279094, + "grad_norm": 2.731320381164551, + "learning_rate": 1.9074916526693804e-05, + "loss": 1.292, + "step": 590 + }, + { + "epoch": 0.44730368968779566, + "grad_norm": 2.6240909099578857, + "learning_rate": 1.9071541202917572e-05, + "loss": 1.2852, + "step": 591 + }, + { + "epoch": 0.4480605487228004, + "grad_norm": 2.8189620971679688, + "learning_rate": 1.906816003240881e-05, + "loss": 1.2655, + "step": 592 + }, + { + "epoch": 0.4488174077578051, + "grad_norm": 2.7323951721191406, + "learning_rate": 1.906477301734697e-05, + "loss": 1.2942, + "step": 593 + }, + { + "epoch": 0.4495742667928098, + "grad_norm": 2.8606555461883545, + "learning_rate": 1.9061380159915262e-05, + "loss": 1.3039, + "step": 594 + }, + { + "epoch": 0.4503311258278146, + "grad_norm": 2.7523887157440186, + "learning_rate": 1.9057981462300683e-05, + "loss": 1.2372, + "step": 595 + }, + { + "epoch": 0.4510879848628193, + "grad_norm": 3.1251001358032227, + "learning_rate": 1.9054576926693977e-05, + "loss": 1.2726, + "step": 596 + }, + { + "epoch": 0.45184484389782403, + "grad_norm": 3.1092488765716553, + "learning_rate": 1.9051166555289652e-05, + "loss": 1.3126, + "step": 597 + }, + { + "epoch": 0.45260170293282875, + "grad_norm": 2.722238302230835, + "learning_rate": 1.904775035028598e-05, + "loss": 1.2765, + "step": 598 + }, + { + "epoch": 0.4533585619678335, + "grad_norm": 3.9474592208862305, + "learning_rate": 1.9044328313885e-05, + "loss": 1.2389, + "step": 599 + }, + { + "epoch": 0.45411542100283825, + "grad_norm": 2.7783472537994385, + "learning_rate": 1.90409004482925e-05, + "loss": 1.2683, + "step": 600 + }, + { + "epoch": 0.45487228003784297, + "grad_norm": 2.7635014057159424, + "learning_rate": 1.9037466755718038e-05, + "loss": 1.3073, + "step": 601 + }, + { + "epoch": 0.4556291390728477, + "grad_norm": 2.899637222290039, + "learning_rate": 1.903402723837491e-05, + "loss": 1.2682, + "step": 602 + }, + { + "epoch": 0.4563859981078524, + "grad_norm": 2.5725064277648926, + "learning_rate": 1.9030581898480182e-05, + "loss": 1.2445, + "step": 603 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 2.767765760421753, + "learning_rate": 1.902713073825467e-05, + "loss": 1.3006, + "step": 604 + }, + { + "epoch": 0.45789971617786185, + "grad_norm": 2.7437305450439453, + "learning_rate": 1.902367375992293e-05, + "loss": 1.256, + "step": 605 + }, + { + "epoch": 0.4586565752128666, + "grad_norm": 2.764497756958008, + "learning_rate": 1.9020210965713287e-05, + "loss": 1.2316, + "step": 606 + }, + { + "epoch": 0.45941343424787134, + "grad_norm": 2.6510708332061768, + "learning_rate": 1.9016742357857802e-05, + "loss": 1.2413, + "step": 607 + }, + { + "epoch": 0.46017029328287606, + "grad_norm": 2.727973699569702, + "learning_rate": 1.9013267938592282e-05, + "loss": 1.2779, + "step": 608 + }, + { + "epoch": 0.4609271523178808, + "grad_norm": 2.7336103916168213, + "learning_rate": 1.900978771015629e-05, + "loss": 1.3133, + "step": 609 + }, + { + "epoch": 0.4616840113528855, + "grad_norm": 2.635427713394165, + "learning_rate": 1.9006301674793128e-05, + "loss": 1.233, + "step": 610 + }, + { + "epoch": 0.4624408703878903, + "grad_norm": 2.99351167678833, + "learning_rate": 1.900280983474984e-05, + "loss": 1.2353, + "step": 611 + }, + { + "epoch": 0.463197729422895, + "grad_norm": 3.155054807662964, + "learning_rate": 1.8999312192277217e-05, + "loss": 1.3258, + "step": 612 + }, + { + "epoch": 0.4639545884578997, + "grad_norm": 2.745626926422119, + "learning_rate": 1.8995808749629773e-05, + "loss": 1.2321, + "step": 613 + }, + { + "epoch": 0.46471144749290444, + "grad_norm": 2.662928819656372, + "learning_rate": 1.899229950906579e-05, + "loss": 1.2291, + "step": 614 + }, + { + "epoch": 0.46546830652790916, + "grad_norm": 2.684296131134033, + "learning_rate": 1.8988784472847262e-05, + "loss": 1.2575, + "step": 615 + }, + { + "epoch": 0.46622516556291393, + "grad_norm": 2.850404977798462, + "learning_rate": 1.8985263643239932e-05, + "loss": 1.2727, + "step": 616 + }, + { + "epoch": 0.46698202459791865, + "grad_norm": 2.8185768127441406, + "learning_rate": 1.8981737022513268e-05, + "loss": 1.2145, + "step": 617 + }, + { + "epoch": 0.4677388836329234, + "grad_norm": 2.865675449371338, + "learning_rate": 1.8978204612940476e-05, + "loss": 1.2602, + "step": 618 + }, + { + "epoch": 0.4684957426679281, + "grad_norm": 2.706779718399048, + "learning_rate": 1.8974666416798496e-05, + "loss": 1.2578, + "step": 619 + }, + { + "epoch": 0.4692526017029328, + "grad_norm": 2.7865641117095947, + "learning_rate": 1.8971122436368002e-05, + "loss": 1.2549, + "step": 620 + }, + { + "epoch": 0.47000946073793753, + "grad_norm": 3.0289227962493896, + "learning_rate": 1.8967572673933373e-05, + "loss": 1.2794, + "step": 621 + }, + { + "epoch": 0.4707663197729423, + "grad_norm": 2.986976146697998, + "learning_rate": 1.8964017131782748e-05, + "loss": 1.2666, + "step": 622 + }, + { + "epoch": 0.47152317880794703, + "grad_norm": 2.907590866088867, + "learning_rate": 1.896045581220797e-05, + "loss": 1.3149, + "step": 623 + }, + { + "epoch": 0.47228003784295175, + "grad_norm": 2.5124711990356445, + "learning_rate": 1.8956888717504607e-05, + "loss": 1.2692, + "step": 624 + }, + { + "epoch": 0.47303689687795647, + "grad_norm": 2.8450794219970703, + "learning_rate": 1.8953315849971956e-05, + "loss": 1.2385, + "step": 625 + }, + { + "epoch": 0.4737937559129612, + "grad_norm": 3.127713441848755, + "learning_rate": 1.8949737211913038e-05, + "loss": 1.2725, + "step": 626 + }, + { + "epoch": 0.47455061494796597, + "grad_norm": 3.0674550533294678, + "learning_rate": 1.894615280563458e-05, + "loss": 1.3016, + "step": 627 + }, + { + "epoch": 0.4753074739829707, + "grad_norm": 3.29008150100708, + "learning_rate": 1.894256263344704e-05, + "loss": 1.2382, + "step": 628 + }, + { + "epoch": 0.4760643330179754, + "grad_norm": 3.2081003189086914, + "learning_rate": 1.8938966697664592e-05, + "loss": 1.259, + "step": 629 + }, + { + "epoch": 0.4768211920529801, + "grad_norm": 2.922011613845825, + "learning_rate": 1.8935365000605116e-05, + "loss": 1.3017, + "step": 630 + }, + { + "epoch": 0.47757805108798485, + "grad_norm": 3.075958490371704, + "learning_rate": 1.893175754459021e-05, + "loss": 1.2595, + "step": 631 + }, + { + "epoch": 0.4783349101229896, + "grad_norm": 2.9022579193115234, + "learning_rate": 1.892814433194519e-05, + "loss": 1.3033, + "step": 632 + }, + { + "epoch": 0.47909176915799434, + "grad_norm": 2.9433717727661133, + "learning_rate": 1.8924525364999077e-05, + "loss": 1.2636, + "step": 633 + }, + { + "epoch": 0.47984862819299906, + "grad_norm": 2.9550983905792236, + "learning_rate": 1.89209006460846e-05, + "loss": 1.2936, + "step": 634 + }, + { + "epoch": 0.4806054872280038, + "grad_norm": 2.8603897094726562, + "learning_rate": 1.8917270177538198e-05, + "loss": 1.2497, + "step": 635 + }, + { + "epoch": 0.4813623462630085, + "grad_norm": 3.0159318447113037, + "learning_rate": 1.8913633961700014e-05, + "loss": 1.2627, + "step": 636 + }, + { + "epoch": 0.4821192052980132, + "grad_norm": 3.3943378925323486, + "learning_rate": 1.8909992000913896e-05, + "loss": 1.2977, + "step": 637 + }, + { + "epoch": 0.482876064333018, + "grad_norm": 2.8387339115142822, + "learning_rate": 1.8906344297527403e-05, + "loss": 1.2922, + "step": 638 + }, + { + "epoch": 0.4836329233680227, + "grad_norm": 2.8385610580444336, + "learning_rate": 1.8902690853891787e-05, + "loss": 1.2023, + "step": 639 + }, + { + "epoch": 0.48438978240302744, + "grad_norm": 3.155811309814453, + "learning_rate": 1.8899031672362e-05, + "loss": 1.3069, + "step": 640 + }, + { + "epoch": 0.48514664143803216, + "grad_norm": 3.442098617553711, + "learning_rate": 1.8895366755296693e-05, + "loss": 1.2361, + "step": 641 + }, + { + "epoch": 0.4859035004730369, + "grad_norm": 2.805680751800537, + "learning_rate": 1.8891696105058218e-05, + "loss": 1.2349, + "step": 642 + }, + { + "epoch": 0.48666035950804165, + "grad_norm": 2.7870709896087646, + "learning_rate": 1.8888019724012618e-05, + "loss": 1.3326, + "step": 643 + }, + { + "epoch": 0.48741721854304637, + "grad_norm": 2.8645455837249756, + "learning_rate": 1.8884337614529636e-05, + "loss": 1.2829, + "step": 644 + }, + { + "epoch": 0.4881740775780511, + "grad_norm": 2.8770759105682373, + "learning_rate": 1.88806497789827e-05, + "loss": 1.2268, + "step": 645 + }, + { + "epoch": 0.4889309366130558, + "grad_norm": 2.8018059730529785, + "learning_rate": 1.8876956219748934e-05, + "loss": 1.2566, + "step": 646 + }, + { + "epoch": 0.48968779564806053, + "grad_norm": 3.0624117851257324, + "learning_rate": 1.887325693920915e-05, + "loss": 1.2776, + "step": 647 + }, + { + "epoch": 0.4904446546830653, + "grad_norm": 2.7411904335021973, + "learning_rate": 1.886955193974785e-05, + "loss": 1.2941, + "step": 648 + }, + { + "epoch": 0.49120151371807, + "grad_norm": 2.4694104194641113, + "learning_rate": 1.8865841223753216e-05, + "loss": 1.245, + "step": 649 + }, + { + "epoch": 0.49195837275307475, + "grad_norm": 2.4889931678771973, + "learning_rate": 1.886212479361712e-05, + "loss": 1.2664, + "step": 650 + }, + { + "epoch": 0.49271523178807947, + "grad_norm": 2.699221134185791, + "learning_rate": 1.885840265173512e-05, + "loss": 1.245, + "step": 651 + }, + { + "epoch": 0.4934720908230842, + "grad_norm": 3.0901527404785156, + "learning_rate": 1.8854674800506447e-05, + "loss": 1.2683, + "step": 652 + }, + { + "epoch": 0.4942289498580889, + "grad_norm": 2.5710549354553223, + "learning_rate": 1.8850941242334024e-05, + "loss": 1.2677, + "step": 653 + }, + { + "epoch": 0.4949858088930937, + "grad_norm": 2.747673988342285, + "learning_rate": 1.8847201979624433e-05, + "loss": 1.2487, + "step": 654 + }, + { + "epoch": 0.4957426679280984, + "grad_norm": 2.6453075408935547, + "learning_rate": 1.8843457014787954e-05, + "loss": 1.2534, + "step": 655 + }, + { + "epoch": 0.4964995269631031, + "grad_norm": 2.3280134201049805, + "learning_rate": 1.8839706350238537e-05, + "loss": 1.2529, + "step": 656 + }, + { + "epoch": 0.49725638599810784, + "grad_norm": 2.353527307510376, + "learning_rate": 1.88359499883938e-05, + "loss": 1.2612, + "step": 657 + }, + { + "epoch": 0.49801324503311256, + "grad_norm": 2.827341318130493, + "learning_rate": 1.8832187931675036e-05, + "loss": 1.2883, + "step": 658 + }, + { + "epoch": 0.49877010406811734, + "grad_norm": 2.620957374572754, + "learning_rate": 1.882842018250721e-05, + "loss": 1.25, + "step": 659 + }, + { + "epoch": 0.49952696310312206, + "grad_norm": 2.600372076034546, + "learning_rate": 1.8824646743318955e-05, + "loss": 1.2497, + "step": 660 + }, + { + "epoch": 0.5002838221381267, + "grad_norm": 2.544832706451416, + "learning_rate": 1.882086761654257e-05, + "loss": 1.2656, + "step": 661 + }, + { + "epoch": 0.5010406811731315, + "grad_norm": 2.809065818786621, + "learning_rate": 1.881708280461403e-05, + "loss": 1.3098, + "step": 662 + }, + { + "epoch": 0.5017975402081363, + "grad_norm": 2.423124313354492, + "learning_rate": 1.881329230997296e-05, + "loss": 1.2676, + "step": 663 + }, + { + "epoch": 0.5025543992431409, + "grad_norm": 2.6886796951293945, + "learning_rate": 1.880949613506266e-05, + "loss": 1.2764, + "step": 664 + }, + { + "epoch": 0.5033112582781457, + "grad_norm": 2.9043877124786377, + "learning_rate": 1.8805694282330076e-05, + "loss": 1.2499, + "step": 665 + }, + { + "epoch": 0.5040681173131504, + "grad_norm": 2.5381906032562256, + "learning_rate": 1.880188675422584e-05, + "loss": 1.2429, + "step": 666 + }, + { + "epoch": 0.5048249763481552, + "grad_norm": 2.5368845462799072, + "learning_rate": 1.8798073553204216e-05, + "loss": 1.2992, + "step": 667 + }, + { + "epoch": 0.5055818353831599, + "grad_norm": 2.313969850540161, + "learning_rate": 1.879425468172314e-05, + "loss": 1.2602, + "step": 668 + }, + { + "epoch": 0.5063386944181646, + "grad_norm": 2.473052978515625, + "learning_rate": 1.8790430142244192e-05, + "loss": 1.2558, + "step": 669 + }, + { + "epoch": 0.5070955534531694, + "grad_norm": 2.5860140323638916, + "learning_rate": 1.878659993723262e-05, + "loss": 1.2489, + "step": 670 + }, + { + "epoch": 0.507852412488174, + "grad_norm": 2.7334864139556885, + "learning_rate": 1.8782764069157307e-05, + "loss": 1.2892, + "step": 671 + }, + { + "epoch": 0.5086092715231788, + "grad_norm": 2.7741503715515137, + "learning_rate": 1.8778922540490803e-05, + "loss": 1.214, + "step": 672 + }, + { + "epoch": 0.5093661305581836, + "grad_norm": 2.3246145248413086, + "learning_rate": 1.8775075353709294e-05, + "loss": 1.2301, + "step": 673 + }, + { + "epoch": 0.5101229895931882, + "grad_norm": 2.879974365234375, + "learning_rate": 1.8771222511292622e-05, + "loss": 1.2351, + "step": 674 + }, + { + "epoch": 0.510879848628193, + "grad_norm": 2.5754384994506836, + "learning_rate": 1.8767364015724266e-05, + "loss": 1.2701, + "step": 675 + }, + { + "epoch": 0.5116367076631977, + "grad_norm": 2.623716115951538, + "learning_rate": 1.8763499869491356e-05, + "loss": 1.2934, + "step": 676 + }, + { + "epoch": 0.5123935666982025, + "grad_norm": 2.6354804039001465, + "learning_rate": 1.8759630075084664e-05, + "loss": 1.2454, + "step": 677 + }, + { + "epoch": 0.5131504257332072, + "grad_norm": 2.550604820251465, + "learning_rate": 1.8755754634998593e-05, + "loss": 1.2555, + "step": 678 + }, + { + "epoch": 0.5139072847682119, + "grad_norm": 2.5519111156463623, + "learning_rate": 1.8751873551731196e-05, + "loss": 1.2384, + "step": 679 + }, + { + "epoch": 0.5146641438032167, + "grad_norm": 2.6348938941955566, + "learning_rate": 1.8747986827784167e-05, + "loss": 1.2453, + "step": 680 + }, + { + "epoch": 0.5154210028382213, + "grad_norm": 2.5110082626342773, + "learning_rate": 1.874409446566282e-05, + "loss": 1.3047, + "step": 681 + }, + { + "epoch": 0.5161778618732261, + "grad_norm": 2.5216503143310547, + "learning_rate": 1.8740196467876114e-05, + "loss": 1.2464, + "step": 682 + }, + { + "epoch": 0.5169347209082309, + "grad_norm": 2.737325668334961, + "learning_rate": 1.8736292836936643e-05, + "loss": 1.2666, + "step": 683 + }, + { + "epoch": 0.5176915799432356, + "grad_norm": 2.625519037246704, + "learning_rate": 1.8732383575360625e-05, + "loss": 1.2403, + "step": 684 + }, + { + "epoch": 0.5184484389782403, + "grad_norm": 2.784569263458252, + "learning_rate": 1.8728468685667914e-05, + "loss": 1.2627, + "step": 685 + }, + { + "epoch": 0.519205298013245, + "grad_norm": 2.7349774837493896, + "learning_rate": 1.8724548170381983e-05, + "loss": 1.2771, + "step": 686 + }, + { + "epoch": 0.5199621570482498, + "grad_norm": 2.681603193283081, + "learning_rate": 1.8720622032029936e-05, + "loss": 1.276, + "step": 687 + }, + { + "epoch": 0.5207190160832545, + "grad_norm": 2.767359972000122, + "learning_rate": 1.8716690273142504e-05, + "loss": 1.2279, + "step": 688 + }, + { + "epoch": 0.5214758751182592, + "grad_norm": 2.5928122997283936, + "learning_rate": 1.871275289625404e-05, + "loss": 1.2568, + "step": 689 + }, + { + "epoch": 0.522232734153264, + "grad_norm": 2.6970558166503906, + "learning_rate": 1.8708809903902517e-05, + "loss": 1.3101, + "step": 690 + }, + { + "epoch": 0.5229895931882687, + "grad_norm": 2.6737709045410156, + "learning_rate": 1.8704861298629524e-05, + "loss": 1.2575, + "step": 691 + }, + { + "epoch": 0.5237464522232734, + "grad_norm": 3.0363659858703613, + "learning_rate": 1.870090708298028e-05, + "loss": 1.3034, + "step": 692 + }, + { + "epoch": 0.5245033112582781, + "grad_norm": 2.817183017730713, + "learning_rate": 1.8696947259503603e-05, + "loss": 1.2962, + "step": 693 + }, + { + "epoch": 0.5252601702932829, + "grad_norm": 3.507577896118164, + "learning_rate": 1.8692981830751937e-05, + "loss": 1.2643, + "step": 694 + }, + { + "epoch": 0.5260170293282876, + "grad_norm": 2.9019994735717773, + "learning_rate": 1.868901079928134e-05, + "loss": 1.2968, + "step": 695 + }, + { + "epoch": 0.5267738883632923, + "grad_norm": 2.6820502281188965, + "learning_rate": 1.8685034167651477e-05, + "loss": 1.281, + "step": 696 + }, + { + "epoch": 0.5275307473982971, + "grad_norm": 2.5685501098632812, + "learning_rate": 1.8681051938425626e-05, + "loss": 1.2368, + "step": 697 + }, + { + "epoch": 0.5282876064333017, + "grad_norm": 2.943498134613037, + "learning_rate": 1.867706411417067e-05, + "loss": 1.2494, + "step": 698 + }, + { + "epoch": 0.5290444654683065, + "grad_norm": 2.9893808364868164, + "learning_rate": 1.8673070697457097e-05, + "loss": 1.3033, + "step": 699 + }, + { + "epoch": 0.5298013245033113, + "grad_norm": 3.192913293838501, + "learning_rate": 1.8669071690859002e-05, + "loss": 1.3122, + "step": 700 + }, + { + "epoch": 0.530558183538316, + "grad_norm": 2.6208715438842773, + "learning_rate": 1.866506709695409e-05, + "loss": 1.2335, + "step": 701 + }, + { + "epoch": 0.5313150425733207, + "grad_norm": 2.793226718902588, + "learning_rate": 1.8661056918323654e-05, + "loss": 1.2721, + "step": 702 + }, + { + "epoch": 0.5320719016083254, + "grad_norm": 2.809190034866333, + "learning_rate": 1.8657041157552597e-05, + "loss": 1.2318, + "step": 703 + }, + { + "epoch": 0.5328287606433302, + "grad_norm": 2.70646595954895, + "learning_rate": 1.865301981722942e-05, + "loss": 1.2471, + "step": 704 + }, + { + "epoch": 0.533585619678335, + "grad_norm": 2.691943407058716, + "learning_rate": 1.864899289994621e-05, + "loss": 1.2765, + "step": 705 + }, + { + "epoch": 0.5343424787133396, + "grad_norm": 2.6376893520355225, + "learning_rate": 1.864496040829867e-05, + "loss": 1.2932, + "step": 706 + }, + { + "epoch": 0.5350993377483444, + "grad_norm": 2.727936029434204, + "learning_rate": 1.8640922344886066e-05, + "loss": 1.2056, + "step": 707 + }, + { + "epoch": 0.5358561967833491, + "grad_norm": 2.599090337753296, + "learning_rate": 1.863687871231128e-05, + "loss": 1.2747, + "step": 708 + }, + { + "epoch": 0.5366130558183538, + "grad_norm": 2.9305431842803955, + "learning_rate": 1.863282951318078e-05, + "loss": 1.2593, + "step": 709 + }, + { + "epoch": 0.5373699148533586, + "grad_norm": 2.5242085456848145, + "learning_rate": 1.8628774750104615e-05, + "loss": 1.2669, + "step": 710 + }, + { + "epoch": 0.5381267738883633, + "grad_norm": 2.737729787826538, + "learning_rate": 1.862471442569642e-05, + "loss": 1.2515, + "step": 711 + }, + { + "epoch": 0.538883632923368, + "grad_norm": 2.8515143394470215, + "learning_rate": 1.8620648542573423e-05, + "loss": 1.2483, + "step": 712 + }, + { + "epoch": 0.5396404919583727, + "grad_norm": 2.8016417026519775, + "learning_rate": 1.8616577103356425e-05, + "loss": 1.2389, + "step": 713 + }, + { + "epoch": 0.5403973509933775, + "grad_norm": 2.9451699256896973, + "learning_rate": 1.861250011066982e-05, + "loss": 1.2345, + "step": 714 + }, + { + "epoch": 0.5411542100283823, + "grad_norm": 2.771279811859131, + "learning_rate": 1.8608417567141572e-05, + "loss": 1.2621, + "step": 715 + }, + { + "epoch": 0.5419110690633869, + "grad_norm": 2.9805190563201904, + "learning_rate": 1.860432947540322e-05, + "loss": 1.2348, + "step": 716 + }, + { + "epoch": 0.5426679280983917, + "grad_norm": 2.803847312927246, + "learning_rate": 1.8600235838089896e-05, + "loss": 1.241, + "step": 717 + }, + { + "epoch": 0.5434247871333964, + "grad_norm": 2.4871954917907715, + "learning_rate": 1.859613665784029e-05, + "loss": 1.2883, + "step": 718 + }, + { + "epoch": 0.5441816461684011, + "grad_norm": 3.067754030227661, + "learning_rate": 1.8592031937296673e-05, + "loss": 1.2833, + "step": 719 + }, + { + "epoch": 0.5449385052034059, + "grad_norm": 2.8348135948181152, + "learning_rate": 1.8587921679104887e-05, + "loss": 1.3083, + "step": 720 + }, + { + "epoch": 0.5456953642384106, + "grad_norm": 2.538663387298584, + "learning_rate": 1.8583805885914345e-05, + "loss": 1.2288, + "step": 721 + }, + { + "epoch": 0.5464522232734154, + "grad_norm": 2.7975425720214844, + "learning_rate": 1.857968456037801e-05, + "loss": 1.3166, + "step": 722 + }, + { + "epoch": 0.54720908230842, + "grad_norm": 2.60284423828125, + "learning_rate": 1.857555770515244e-05, + "loss": 1.251, + "step": 723 + }, + { + "epoch": 0.5479659413434248, + "grad_norm": 3.0047545433044434, + "learning_rate": 1.857142532289774e-05, + "loss": 1.2372, + "step": 724 + }, + { + "epoch": 0.5487228003784295, + "grad_norm": 2.7439827919006348, + "learning_rate": 1.8567287416277576e-05, + "loss": 1.2686, + "step": 725 + }, + { + "epoch": 0.5494796594134342, + "grad_norm": 2.7966012954711914, + "learning_rate": 1.856314398795918e-05, + "loss": 1.2997, + "step": 726 + }, + { + "epoch": 0.550236518448439, + "grad_norm": 2.4072394371032715, + "learning_rate": 1.855899504061335e-05, + "loss": 1.2371, + "step": 727 + }, + { + "epoch": 0.5509933774834437, + "grad_norm": 2.6710758209228516, + "learning_rate": 1.8554840576914425e-05, + "loss": 1.3084, + "step": 728 + }, + { + "epoch": 0.5517502365184485, + "grad_norm": 2.4834091663360596, + "learning_rate": 1.8550680599540315e-05, + "loss": 1.2335, + "step": 729 + }, + { + "epoch": 0.5525070955534531, + "grad_norm": 3.0747454166412354, + "learning_rate": 1.8546515111172475e-05, + "loss": 1.2691, + "step": 730 + }, + { + "epoch": 0.5532639545884579, + "grad_norm": 2.3881189823150635, + "learning_rate": 1.8542344114495918e-05, + "loss": 1.2852, + "step": 731 + }, + { + "epoch": 0.5540208136234627, + "grad_norm": 2.559795618057251, + "learning_rate": 1.85381676121992e-05, + "loss": 1.2266, + "step": 732 + }, + { + "epoch": 0.5547776726584673, + "grad_norm": 2.5426385402679443, + "learning_rate": 1.8533985606974436e-05, + "loss": 1.2136, + "step": 733 + }, + { + "epoch": 0.5555345316934721, + "grad_norm": 2.7627816200256348, + "learning_rate": 1.8529798101517283e-05, + "loss": 1.227, + "step": 734 + }, + { + "epoch": 0.5562913907284768, + "grad_norm": 3.559936285018921, + "learning_rate": 1.8525605098526935e-05, + "loss": 1.2823, + "step": 735 + }, + { + "epoch": 0.5570482497634816, + "grad_norm": 2.6380114555358887, + "learning_rate": 1.8521406600706146e-05, + "loss": 1.2077, + "step": 736 + }, + { + "epoch": 0.5578051087984863, + "grad_norm": 2.3080461025238037, + "learning_rate": 1.8517202610761203e-05, + "loss": 1.2146, + "step": 737 + }, + { + "epoch": 0.558561967833491, + "grad_norm": 2.245431423187256, + "learning_rate": 1.851299313140193e-05, + "loss": 1.2073, + "step": 738 + }, + { + "epoch": 0.5593188268684958, + "grad_norm": 2.4832706451416016, + "learning_rate": 1.8508778165341697e-05, + "loss": 1.2167, + "step": 739 + }, + { + "epoch": 0.5600756859035004, + "grad_norm": 2.646280288696289, + "learning_rate": 1.85045577152974e-05, + "loss": 1.2379, + "step": 740 + }, + { + "epoch": 0.5608325449385052, + "grad_norm": 2.449310302734375, + "learning_rate": 1.8500331783989486e-05, + "loss": 1.2085, + "step": 741 + }, + { + "epoch": 0.56158940397351, + "grad_norm": 2.7046239376068115, + "learning_rate": 1.8496100374141924e-05, + "loss": 1.2255, + "step": 742 + }, + { + "epoch": 0.5623462630085146, + "grad_norm": 2.5250003337860107, + "learning_rate": 1.849186348848221e-05, + "loss": 1.2028, + "step": 743 + }, + { + "epoch": 0.5631031220435194, + "grad_norm": 2.423783779144287, + "learning_rate": 1.848762112974138e-05, + "loss": 1.2485, + "step": 744 + }, + { + "epoch": 0.5638599810785241, + "grad_norm": 2.3143739700317383, + "learning_rate": 1.8483373300653995e-05, + "loss": 1.2238, + "step": 745 + }, + { + "epoch": 0.5646168401135289, + "grad_norm": 2.433070421218872, + "learning_rate": 1.8479120003958136e-05, + "loss": 1.2496, + "step": 746 + }, + { + "epoch": 0.5653736991485336, + "grad_norm": 2.5320703983306885, + "learning_rate": 1.8474861242395424e-05, + "loss": 1.2477, + "step": 747 + }, + { + "epoch": 0.5661305581835383, + "grad_norm": 5.566840171813965, + "learning_rate": 1.8470597018710976e-05, + "loss": 1.2086, + "step": 748 + }, + { + "epoch": 0.5668874172185431, + "grad_norm": 2.5624606609344482, + "learning_rate": 1.8466327335653458e-05, + "loss": 1.2458, + "step": 749 + }, + { + "epoch": 0.5676442762535477, + "grad_norm": 3.781528949737549, + "learning_rate": 1.846205219597504e-05, + "loss": 1.259, + "step": 750 + }, + { + "epoch": 0.5684011352885525, + "grad_norm": 2.4453654289245605, + "learning_rate": 1.8457771602431406e-05, + "loss": 1.2511, + "step": 751 + }, + { + "epoch": 0.5691579943235573, + "grad_norm": 2.4234702587127686, + "learning_rate": 1.8453485557781768e-05, + "loss": 1.2339, + "step": 752 + }, + { + "epoch": 0.569914853358562, + "grad_norm": 2.637007236480713, + "learning_rate": 1.8449194064788845e-05, + "loss": 1.2274, + "step": 753 + }, + { + "epoch": 0.5706717123935667, + "grad_norm": 2.557408332824707, + "learning_rate": 1.8444897126218865e-05, + "loss": 1.2718, + "step": 754 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 2.3460357189178467, + "learning_rate": 1.8440594744841564e-05, + "loss": 1.2522, + "step": 755 + }, + { + "epoch": 0.5721854304635762, + "grad_norm": 2.9702370166778564, + "learning_rate": 1.84362869234302e-05, + "loss": 1.2365, + "step": 756 + }, + { + "epoch": 0.5729422894985808, + "grad_norm": 2.4645347595214844, + "learning_rate": 1.843197366476153e-05, + "loss": 1.2497, + "step": 757 + }, + { + "epoch": 0.5736991485335856, + "grad_norm": 2.525984764099121, + "learning_rate": 1.8427654971615804e-05, + "loss": 1.2472, + "step": 758 + }, + { + "epoch": 0.5744560075685904, + "grad_norm": 2.598914861679077, + "learning_rate": 1.8423330846776797e-05, + "loss": 1.2783, + "step": 759 + }, + { + "epoch": 0.575212866603595, + "grad_norm": 2.464893341064453, + "learning_rate": 1.841900129303177e-05, + "loss": 1.2331, + "step": 760 + }, + { + "epoch": 0.5759697256385998, + "grad_norm": 2.517779588699341, + "learning_rate": 1.8414666313171488e-05, + "loss": 1.2087, + "step": 761 + }, + { + "epoch": 0.5767265846736045, + "grad_norm": 2.3364832401275635, + "learning_rate": 1.8410325909990207e-05, + "loss": 1.251, + "step": 762 + }, + { + "epoch": 0.5774834437086093, + "grad_norm": 2.348635673522949, + "learning_rate": 1.8405980086285693e-05, + "loss": 1.2424, + "step": 763 + }, + { + "epoch": 0.578240302743614, + "grad_norm": 2.472801446914673, + "learning_rate": 1.8401628844859193e-05, + "loss": 1.1972, + "step": 764 + }, + { + "epoch": 0.5789971617786187, + "grad_norm": 2.528832197189331, + "learning_rate": 1.839727218851545e-05, + "loss": 1.2904, + "step": 765 + }, + { + "epoch": 0.5797540208136235, + "grad_norm": 2.833585262298584, + "learning_rate": 1.83929101200627e-05, + "loss": 1.2284, + "step": 766 + }, + { + "epoch": 0.5805108798486281, + "grad_norm": 2.886864185333252, + "learning_rate": 1.838854264231267e-05, + "loss": 1.2529, + "step": 767 + }, + { + "epoch": 0.5812677388836329, + "grad_norm": 2.6184258460998535, + "learning_rate": 1.8384169758080564e-05, + "loss": 1.2422, + "step": 768 + }, + { + "epoch": 0.5820245979186377, + "grad_norm": 2.59594988822937, + "learning_rate": 1.8379791470185077e-05, + "loss": 1.2349, + "step": 769 + }, + { + "epoch": 0.5827814569536424, + "grad_norm": 2.359560489654541, + "learning_rate": 1.837540778144839e-05, + "loss": 1.2146, + "step": 770 + }, + { + "epoch": 0.5835383159886471, + "grad_norm": 3.088444709777832, + "learning_rate": 1.8371018694696155e-05, + "loss": 1.2667, + "step": 771 + }, + { + "epoch": 0.5842951750236518, + "grad_norm": 2.766091823577881, + "learning_rate": 1.836662421275752e-05, + "loss": 1.218, + "step": 772 + }, + { + "epoch": 0.5850520340586566, + "grad_norm": 2.739274263381958, + "learning_rate": 1.8362224338465093e-05, + "loss": 1.2618, + "step": 773 + }, + { + "epoch": 0.5858088930936614, + "grad_norm": 4.742860794067383, + "learning_rate": 1.835781907465497e-05, + "loss": 1.2989, + "step": 774 + }, + { + "epoch": 0.586565752128666, + "grad_norm": 3.0373001098632812, + "learning_rate": 1.8353408424166712e-05, + "loss": 1.283, + "step": 775 + }, + { + "epoch": 0.5873226111636708, + "grad_norm": 2.5657973289489746, + "learning_rate": 1.8348992389843365e-05, + "loss": 1.1942, + "step": 776 + }, + { + "epoch": 0.5880794701986755, + "grad_norm": 2.7128591537475586, + "learning_rate": 1.834457097453143e-05, + "loss": 1.262, + "step": 777 + }, + { + "epoch": 0.5888363292336802, + "grad_norm": 2.4917023181915283, + "learning_rate": 1.834014418108089e-05, + "loss": 1.2194, + "step": 778 + }, + { + "epoch": 0.589593188268685, + "grad_norm": 2.7309277057647705, + "learning_rate": 1.8335712012345188e-05, + "loss": 1.231, + "step": 779 + }, + { + "epoch": 0.5903500473036897, + "grad_norm": 2.894216537475586, + "learning_rate": 1.8331274471181224e-05, + "loss": 1.234, + "step": 780 + }, + { + "epoch": 0.5911069063386944, + "grad_norm": 2.491863250732422, + "learning_rate": 1.8326831560449375e-05, + "loss": 1.2335, + "step": 781 + }, + { + "epoch": 0.5918637653736991, + "grad_norm": 2.636247396469116, + "learning_rate": 1.832238328301348e-05, + "loss": 1.2371, + "step": 782 + }, + { + "epoch": 0.5926206244087039, + "grad_norm": 2.6232643127441406, + "learning_rate": 1.831792964174082e-05, + "loss": 1.2626, + "step": 783 + }, + { + "epoch": 0.5933774834437087, + "grad_norm": 2.667076826095581, + "learning_rate": 1.8313470639502148e-05, + "loss": 1.257, + "step": 784 + }, + { + "epoch": 0.5941343424787133, + "grad_norm": 2.936359405517578, + "learning_rate": 1.8309006279171675e-05, + "loss": 1.2509, + "step": 785 + }, + { + "epoch": 0.5948912015137181, + "grad_norm": 2.522406578063965, + "learning_rate": 1.8304536563627052e-05, + "loss": 1.2804, + "step": 786 + }, + { + "epoch": 0.5956480605487228, + "grad_norm": 2.542407512664795, + "learning_rate": 1.830006149574939e-05, + "loss": 1.2393, + "step": 787 + }, + { + "epoch": 0.5964049195837275, + "grad_norm": 2.5919876098632812, + "learning_rate": 1.8295581078423253e-05, + "loss": 1.2622, + "step": 788 + }, + { + "epoch": 0.5971617786187322, + "grad_norm": 2.7095932960510254, + "learning_rate": 1.8291095314536647e-05, + "loss": 1.2491, + "step": 789 + }, + { + "epoch": 0.597918637653737, + "grad_norm": 2.4110512733459473, + "learning_rate": 1.8286604206981028e-05, + "loss": 1.2622, + "step": 790 + }, + { + "epoch": 0.5986754966887418, + "grad_norm": 2.7041079998016357, + "learning_rate": 1.8282107758651295e-05, + "loss": 1.2563, + "step": 791 + }, + { + "epoch": 0.5994323557237464, + "grad_norm": 2.7525973320007324, + "learning_rate": 1.827760597244579e-05, + "loss": 1.2449, + "step": 792 + }, + { + "epoch": 0.6001892147587512, + "grad_norm": 2.612968921661377, + "learning_rate": 1.8273098851266297e-05, + "loss": 1.258, + "step": 793 + }, + { + "epoch": 0.6009460737937559, + "grad_norm": 2.6070921421051025, + "learning_rate": 1.826858639801804e-05, + "loss": 1.3045, + "step": 794 + }, + { + "epoch": 0.6017029328287606, + "grad_norm": 2.4890692234039307, + "learning_rate": 1.8264068615609668e-05, + "loss": 1.2253, + "step": 795 + }, + { + "epoch": 0.6024597918637654, + "grad_norm": 2.9760918617248535, + "learning_rate": 1.8259545506953285e-05, + "loss": 1.2673, + "step": 796 + }, + { + "epoch": 0.6032166508987701, + "grad_norm": 2.8577773571014404, + "learning_rate": 1.825501707496441e-05, + "loss": 1.264, + "step": 797 + }, + { + "epoch": 0.6039735099337749, + "grad_norm": 2.549546718597412, + "learning_rate": 1.825048332256201e-05, + "loss": 1.2228, + "step": 798 + }, + { + "epoch": 0.6047303689687795, + "grad_norm": 2.7687017917633057, + "learning_rate": 1.8245944252668462e-05, + "loss": 1.2522, + "step": 799 + }, + { + "epoch": 0.6054872280037843, + "grad_norm": 2.533287763595581, + "learning_rate": 1.824139986820959e-05, + "loss": 1.1939, + "step": 800 + }, + { + "epoch": 0.6062440870387891, + "grad_norm": 2.6402809619903564, + "learning_rate": 1.8236850172114633e-05, + "loss": 1.2417, + "step": 801 + }, + { + "epoch": 0.6070009460737937, + "grad_norm": 2.592946767807007, + "learning_rate": 1.8232295167316252e-05, + "loss": 1.2922, + "step": 802 + }, + { + "epoch": 0.6077578051087985, + "grad_norm": 2.6012048721313477, + "learning_rate": 1.8227734856750537e-05, + "loss": 1.2658, + "step": 803 + }, + { + "epoch": 0.6085146641438032, + "grad_norm": 2.737257242202759, + "learning_rate": 1.8223169243356995e-05, + "loss": 1.2955, + "step": 804 + }, + { + "epoch": 0.609271523178808, + "grad_norm": 2.8576440811157227, + "learning_rate": 1.8218598330078548e-05, + "loss": 1.261, + "step": 805 + }, + { + "epoch": 0.6100283822138127, + "grad_norm": 4.944385051727295, + "learning_rate": 1.8214022119861537e-05, + "loss": 1.2438, + "step": 806 + }, + { + "epoch": 0.6107852412488174, + "grad_norm": 2.8472225666046143, + "learning_rate": 1.820944061565572e-05, + "loss": 1.2305, + "step": 807 + }, + { + "epoch": 0.6115421002838222, + "grad_norm": 2.8943638801574707, + "learning_rate": 1.8204853820414267e-05, + "loss": 1.2608, + "step": 808 + }, + { + "epoch": 0.6122989593188268, + "grad_norm": 2.523142099380493, + "learning_rate": 1.820026173709375e-05, + "loss": 1.2721, + "step": 809 + }, + { + "epoch": 0.6130558183538316, + "grad_norm": 2.8089590072631836, + "learning_rate": 1.8195664368654157e-05, + "loss": 1.222, + "step": 810 + }, + { + "epoch": 0.6138126773888364, + "grad_norm": 2.9274590015411377, + "learning_rate": 1.8191061718058885e-05, + "loss": 1.2534, + "step": 811 + }, + { + "epoch": 0.614569536423841, + "grad_norm": 2.6819167137145996, + "learning_rate": 1.818645378827473e-05, + "loss": 1.2566, + "step": 812 + }, + { + "epoch": 0.6153263954588458, + "grad_norm": 2.5687010288238525, + "learning_rate": 1.8181840582271897e-05, + "loss": 1.2323, + "step": 813 + }, + { + "epoch": 0.6160832544938505, + "grad_norm": 2.636622428894043, + "learning_rate": 1.8177222103023983e-05, + "loss": 1.2007, + "step": 814 + }, + { + "epoch": 0.6168401135288553, + "grad_norm": 2.5585618019104004, + "learning_rate": 1.8172598353507988e-05, + "loss": 1.2169, + "step": 815 + }, + { + "epoch": 0.61759697256386, + "grad_norm": 2.880889415740967, + "learning_rate": 1.8167969336704322e-05, + "loss": 1.2211, + "step": 816 + }, + { + "epoch": 0.6183538315988647, + "grad_norm": 2.575530767440796, + "learning_rate": 1.8163335055596764e-05, + "loss": 1.2165, + "step": 817 + }, + { + "epoch": 0.6191106906338695, + "grad_norm": 2.65857195854187, + "learning_rate": 1.815869551317251e-05, + "loss": 1.2527, + "step": 818 + }, + { + "epoch": 0.6198675496688741, + "grad_norm": 2.7308692932128906, + "learning_rate": 1.8154050712422135e-05, + "loss": 1.245, + "step": 819 + }, + { + "epoch": 0.6206244087038789, + "grad_norm": 2.4128143787384033, + "learning_rate": 1.8149400656339606e-05, + "loss": 1.2274, + "step": 820 + }, + { + "epoch": 0.6213812677388836, + "grad_norm": 2.678269386291504, + "learning_rate": 1.8144745347922282e-05, + "loss": 1.2348, + "step": 821 + }, + { + "epoch": 0.6221381267738884, + "grad_norm": 2.4970011711120605, + "learning_rate": 1.81400847901709e-05, + "loss": 1.2525, + "step": 822 + }, + { + "epoch": 0.6228949858088931, + "grad_norm": 3.0284082889556885, + "learning_rate": 1.813541898608959e-05, + "loss": 1.2283, + "step": 823 + }, + { + "epoch": 0.6236518448438978, + "grad_norm": 2.5325472354888916, + "learning_rate": 1.813074793868585e-05, + "loss": 1.2177, + "step": 824 + }, + { + "epoch": 0.6244087038789026, + "grad_norm": 2.8422694206237793, + "learning_rate": 1.8126071650970566e-05, + "loss": 1.1957, + "step": 825 + }, + { + "epoch": 0.6251655629139072, + "grad_norm": 2.7805769443511963, + "learning_rate": 1.8121390125958012e-05, + "loss": 1.2406, + "step": 826 + }, + { + "epoch": 0.625922421948912, + "grad_norm": 3.035707473754883, + "learning_rate": 1.811670336666582e-05, + "loss": 1.2217, + "step": 827 + }, + { + "epoch": 0.6266792809839168, + "grad_norm": 2.6617417335510254, + "learning_rate": 1.8112011376115004e-05, + "loss": 1.2489, + "step": 828 + }, + { + "epoch": 0.6274361400189215, + "grad_norm": 2.593369722366333, + "learning_rate": 1.8107314157329953e-05, + "loss": 1.2582, + "step": 829 + }, + { + "epoch": 0.6281929990539262, + "grad_norm": 2.33566951751709, + "learning_rate": 1.810261171333842e-05, + "loss": 1.1726, + "step": 830 + }, + { + "epoch": 0.6289498580889309, + "grad_norm": 2.6399929523468018, + "learning_rate": 1.8097904047171525e-05, + "loss": 1.233, + "step": 831 + }, + { + "epoch": 0.6297067171239357, + "grad_norm": 2.833388328552246, + "learning_rate": 1.8093191161863765e-05, + "loss": 1.2465, + "step": 832 + }, + { + "epoch": 0.6304635761589404, + "grad_norm": 2.5618953704833984, + "learning_rate": 1.808847306045299e-05, + "loss": 1.2786, + "step": 833 + }, + { + "epoch": 0.6312204351939451, + "grad_norm": 2.449512004852295, + "learning_rate": 1.8083749745980417e-05, + "loss": 1.1866, + "step": 834 + }, + { + "epoch": 0.6319772942289499, + "grad_norm": 2.3261687755584717, + "learning_rate": 1.8079021221490623e-05, + "loss": 1.2293, + "step": 835 + }, + { + "epoch": 0.6327341532639545, + "grad_norm": 2.2670247554779053, + "learning_rate": 1.8074287490031544e-05, + "loss": 1.2307, + "step": 836 + }, + { + "epoch": 0.6334910122989593, + "grad_norm": 2.9090189933776855, + "learning_rate": 1.8069548554654465e-05, + "loss": 1.23, + "step": 837 + }, + { + "epoch": 0.6342478713339641, + "grad_norm": 2.2023513317108154, + "learning_rate": 1.8064804418414036e-05, + "loss": 1.2559, + "step": 838 + }, + { + "epoch": 0.6350047303689688, + "grad_norm": 2.3907856941223145, + "learning_rate": 1.8060055084368256e-05, + "loss": 1.1783, + "step": 839 + }, + { + "epoch": 0.6357615894039735, + "grad_norm": 2.7036445140838623, + "learning_rate": 1.805530055557847e-05, + "loss": 1.2268, + "step": 840 + }, + { + "epoch": 0.6365184484389782, + "grad_norm": 2.429286003112793, + "learning_rate": 1.805054083510938e-05, + "loss": 1.1904, + "step": 841 + }, + { + "epoch": 0.637275307473983, + "grad_norm": 2.644791603088379, + "learning_rate": 1.804577592602902e-05, + "loss": 1.1866, + "step": 842 + }, + { + "epoch": 0.6380321665089878, + "grad_norm": 2.7880802154541016, + "learning_rate": 1.804100583140879e-05, + "loss": 1.2817, + "step": 843 + }, + { + "epoch": 0.6387890255439924, + "grad_norm": 2.485358476638794, + "learning_rate": 1.8036230554323413e-05, + "loss": 1.281, + "step": 844 + }, + { + "epoch": 0.6395458845789972, + "grad_norm": 2.5849761962890625, + "learning_rate": 1.803145009785096e-05, + "loss": 1.248, + "step": 845 + }, + { + "epoch": 0.6403027436140019, + "grad_norm": 2.357409715652466, + "learning_rate": 1.8026664465072838e-05, + "loss": 1.2828, + "step": 846 + }, + { + "epoch": 0.6410596026490066, + "grad_norm": 2.4510414600372314, + "learning_rate": 1.80218736590738e-05, + "loss": 1.2275, + "step": 847 + }, + { + "epoch": 0.6418164616840114, + "grad_norm": 2.625035524368286, + "learning_rate": 1.8017077682941918e-05, + "loss": 1.2369, + "step": 848 + }, + { + "epoch": 0.6425733207190161, + "grad_norm": 2.4510104656219482, + "learning_rate": 1.8012276539768613e-05, + "loss": 1.2624, + "step": 849 + }, + { + "epoch": 0.6433301797540208, + "grad_norm": 2.6468582153320312, + "learning_rate": 1.800747023264862e-05, + "loss": 1.2964, + "step": 850 + }, + { + "epoch": 0.6440870387890255, + "grad_norm": 2.45991587638855, + "learning_rate": 1.800265876468002e-05, + "loss": 1.2359, + "step": 851 + }, + { + "epoch": 0.6448438978240303, + "grad_norm": 2.546734571456909, + "learning_rate": 1.799784213896421e-05, + "loss": 1.2124, + "step": 852 + }, + { + "epoch": 0.645600756859035, + "grad_norm": 2.265397787094116, + "learning_rate": 1.799302035860591e-05, + "loss": 1.1945, + "step": 853 + }, + { + "epoch": 0.6463576158940397, + "grad_norm": 2.4162395000457764, + "learning_rate": 1.7988193426713165e-05, + "loss": 1.2115, + "step": 854 + }, + { + "epoch": 0.6471144749290445, + "grad_norm": 2.2301483154296875, + "learning_rate": 1.7983361346397347e-05, + "loss": 1.2699, + "step": 855 + }, + { + "epoch": 0.6478713339640492, + "grad_norm": 2.2673699855804443, + "learning_rate": 1.797852412077314e-05, + "loss": 1.2525, + "step": 856 + }, + { + "epoch": 0.6486281929990539, + "grad_norm": 2.5041098594665527, + "learning_rate": 1.7973681752958543e-05, + "loss": 1.231, + "step": 857 + }, + { + "epoch": 0.6493850520340586, + "grad_norm": 2.5438284873962402, + "learning_rate": 1.7968834246074875e-05, + "loss": 1.2316, + "step": 858 + }, + { + "epoch": 0.6501419110690634, + "grad_norm": 2.4436419010162354, + "learning_rate": 1.7963981603246762e-05, + "loss": 1.2461, + "step": 859 + }, + { + "epoch": 0.6508987701040682, + "grad_norm": 2.3260018825531006, + "learning_rate": 1.795912382760215e-05, + "loss": 1.2575, + "step": 860 + }, + { + "epoch": 0.6516556291390728, + "grad_norm": 2.527569532394409, + "learning_rate": 1.7954260922272278e-05, + "loss": 1.2552, + "step": 861 + }, + { + "epoch": 0.6524124881740776, + "grad_norm": 2.5068411827087402, + "learning_rate": 1.7949392890391706e-05, + "loss": 1.2439, + "step": 862 + }, + { + "epoch": 0.6531693472090823, + "grad_norm": 2.8131117820739746, + "learning_rate": 1.7944519735098295e-05, + "loss": 1.2669, + "step": 863 + }, + { + "epoch": 0.653926206244087, + "grad_norm": 2.368083953857422, + "learning_rate": 1.79396414595332e-05, + "loss": 1.273, + "step": 864 + }, + { + "epoch": 0.6546830652790918, + "grad_norm": 2.4757819175720215, + "learning_rate": 1.7934758066840893e-05, + "loss": 1.2652, + "step": 865 + }, + { + "epoch": 0.6554399243140965, + "grad_norm": 2.7727437019348145, + "learning_rate": 1.7929869560169123e-05, + "loss": 1.2661, + "step": 866 + }, + { + "epoch": 0.6561967833491013, + "grad_norm": 2.5417017936706543, + "learning_rate": 1.7924975942668954e-05, + "loss": 1.2624, + "step": 867 + }, + { + "epoch": 0.6569536423841059, + "grad_norm": 3.0404696464538574, + "learning_rate": 1.792007721749474e-05, + "loss": 1.2149, + "step": 868 + }, + { + "epoch": 0.6577105014191107, + "grad_norm": 2.528648853302002, + "learning_rate": 1.7915173387804115e-05, + "loss": 1.2536, + "step": 869 + }, + { + "epoch": 0.6584673604541155, + "grad_norm": 2.5994584560394287, + "learning_rate": 1.791026445675802e-05, + "loss": 1.2146, + "step": 870 + }, + { + "epoch": 0.6592242194891201, + "grad_norm": 2.523890495300293, + "learning_rate": 1.7905350427520672e-05, + "loss": 1.2599, + "step": 871 + }, + { + "epoch": 0.6599810785241249, + "grad_norm": 3.055417537689209, + "learning_rate": 1.7900431303259585e-05, + "loss": 1.2447, + "step": 872 + }, + { + "epoch": 0.6607379375591296, + "grad_norm": 2.5144965648651123, + "learning_rate": 1.789550708714555e-05, + "loss": 1.2022, + "step": 873 + }, + { + "epoch": 0.6614947965941343, + "grad_norm": 2.5344860553741455, + "learning_rate": 1.789057778235264e-05, + "loss": 1.2578, + "step": 874 + }, + { + "epoch": 0.6622516556291391, + "grad_norm": 2.7370986938476562, + "learning_rate": 1.7885643392058207e-05, + "loss": 1.1964, + "step": 875 + }, + { + "epoch": 0.6630085146641438, + "grad_norm": 2.831005573272705, + "learning_rate": 1.7880703919442885e-05, + "loss": 1.2218, + "step": 876 + }, + { + "epoch": 0.6637653736991486, + "grad_norm": 2.2660505771636963, + "learning_rate": 1.787575936769059e-05, + "loss": 1.232, + "step": 877 + }, + { + "epoch": 0.6645222327341532, + "grad_norm": 2.5577943325042725, + "learning_rate": 1.78708097399885e-05, + "loss": 1.289, + "step": 878 + }, + { + "epoch": 0.665279091769158, + "grad_norm": 2.8066608905792236, + "learning_rate": 1.786585503952707e-05, + "loss": 1.2051, + "step": 879 + }, + { + "epoch": 0.6660359508041628, + "grad_norm": 2.683680295944214, + "learning_rate": 1.786089526950002e-05, + "loss": 1.2343, + "step": 880 + }, + { + "epoch": 0.6667928098391674, + "grad_norm": 2.571253538131714, + "learning_rate": 1.785593043310434e-05, + "loss": 1.2279, + "step": 881 + }, + { + "epoch": 0.6675496688741722, + "grad_norm": 2.2818214893341064, + "learning_rate": 1.78509605335403e-05, + "loss": 1.2346, + "step": 882 + }, + { + "epoch": 0.6683065279091769, + "grad_norm": 2.427520513534546, + "learning_rate": 1.7845985574011413e-05, + "loss": 1.251, + "step": 883 + }, + { + "epoch": 0.6690633869441817, + "grad_norm": 2.615901231765747, + "learning_rate": 1.784100555772446e-05, + "loss": 1.2697, + "step": 884 + }, + { + "epoch": 0.6698202459791863, + "grad_norm": 2.3778128623962402, + "learning_rate": 1.7836020487889495e-05, + "loss": 1.2291, + "step": 885 + }, + { + "epoch": 0.6705771050141911, + "grad_norm": 2.4669504165649414, + "learning_rate": 1.7831030367719802e-05, + "loss": 1.2365, + "step": 886 + }, + { + "epoch": 0.6713339640491959, + "grad_norm": 2.397721290588379, + "learning_rate": 1.782603520043195e-05, + "loss": 1.2718, + "step": 887 + }, + { + "epoch": 0.6720908230842005, + "grad_norm": 2.323598623275757, + "learning_rate": 1.782103498924574e-05, + "loss": 1.2706, + "step": 888 + }, + { + "epoch": 0.6728476821192053, + "grad_norm": 2.592615842819214, + "learning_rate": 1.7816029737384234e-05, + "loss": 1.2821, + "step": 889 + }, + { + "epoch": 0.67360454115421, + "grad_norm": 2.552388906478882, + "learning_rate": 1.7811019448073742e-05, + "loss": 1.2075, + "step": 890 + }, + { + "epoch": 0.6743614001892148, + "grad_norm": 2.659424304962158, + "learning_rate": 1.7806004124543818e-05, + "loss": 1.2365, + "step": 891 + }, + { + "epoch": 0.6751182592242195, + "grad_norm": 2.596625328063965, + "learning_rate": 1.7800983770027266e-05, + "loss": 1.2685, + "step": 892 + }, + { + "epoch": 0.6758751182592242, + "grad_norm": 2.485259771347046, + "learning_rate": 1.779595838776013e-05, + "loss": 1.2453, + "step": 893 + }, + { + "epoch": 0.676631977294229, + "grad_norm": 2.3858642578125, + "learning_rate": 1.7790927980981687e-05, + "loss": 1.1896, + "step": 894 + }, + { + "epoch": 0.6773888363292336, + "grad_norm": 2.53601336479187, + "learning_rate": 1.7785892552934468e-05, + "loss": 1.2533, + "step": 895 + }, + { + "epoch": 0.6781456953642384, + "grad_norm": 2.7505080699920654, + "learning_rate": 1.778085210686423e-05, + "loss": 1.2449, + "step": 896 + }, + { + "epoch": 0.6789025543992432, + "grad_norm": 2.4080655574798584, + "learning_rate": 1.7775806646019974e-05, + "loss": 1.1985, + "step": 897 + }, + { + "epoch": 0.6796594134342478, + "grad_norm": 2.742640972137451, + "learning_rate": 1.7770756173653923e-05, + "loss": 1.2434, + "step": 898 + }, + { + "epoch": 0.6804162724692526, + "grad_norm": 2.377990484237671, + "learning_rate": 1.776570069302153e-05, + "loss": 1.1726, + "step": 899 + }, + { + "epoch": 0.6811731315042573, + "grad_norm": 2.35687518119812, + "learning_rate": 1.7760640207381486e-05, + "loss": 1.2189, + "step": 900 + }, + { + "epoch": 0.6819299905392621, + "grad_norm": 2.576018810272217, + "learning_rate": 1.77555747199957e-05, + "loss": 1.2318, + "step": 901 + }, + { + "epoch": 0.6826868495742668, + "grad_norm": 2.3314318656921387, + "learning_rate": 1.7750504234129312e-05, + "loss": 1.1889, + "step": 902 + }, + { + "epoch": 0.6834437086092715, + "grad_norm": 2.3357717990875244, + "learning_rate": 1.7745428753050675e-05, + "loss": 1.2168, + "step": 903 + }, + { + "epoch": 0.6842005676442763, + "grad_norm": 2.2540555000305176, + "learning_rate": 1.774034828003137e-05, + "loss": 1.2017, + "step": 904 + }, + { + "epoch": 0.684957426679281, + "grad_norm": 2.325144052505493, + "learning_rate": 1.773526281834619e-05, + "loss": 1.2409, + "step": 905 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 2.731501340866089, + "learning_rate": 1.7730172371273147e-05, + "loss": 1.2765, + "step": 906 + }, + { + "epoch": 0.6864711447492905, + "grad_norm": 2.3535265922546387, + "learning_rate": 1.7725076942093468e-05, + "loss": 1.2353, + "step": 907 + }, + { + "epoch": 0.6872280037842952, + "grad_norm": 2.852663040161133, + "learning_rate": 1.7719976534091584e-05, + "loss": 1.2761, + "step": 908 + }, + { + "epoch": 0.6879848628192999, + "grad_norm": 2.5675928592681885, + "learning_rate": 1.7714871150555146e-05, + "loss": 1.1906, + "step": 909 + }, + { + "epoch": 0.6887417218543046, + "grad_norm": 2.1720049381256104, + "learning_rate": 1.7709760794775e-05, + "loss": 1.2057, + "step": 910 + }, + { + "epoch": 0.6894985808893094, + "grad_norm": 2.567373514175415, + "learning_rate": 1.7704645470045213e-05, + "loss": 1.2365, + "step": 911 + }, + { + "epoch": 0.6902554399243142, + "grad_norm": 2.38577938079834, + "learning_rate": 1.7699525179663034e-05, + "loss": 1.2047, + "step": 912 + }, + { + "epoch": 0.6910122989593188, + "grad_norm": 2.3595142364501953, + "learning_rate": 1.7694399926928932e-05, + "loss": 1.2329, + "step": 913 + }, + { + "epoch": 0.6917691579943236, + "grad_norm": 2.7524566650390625, + "learning_rate": 1.7689269715146562e-05, + "loss": 1.2461, + "step": 914 + }, + { + "epoch": 0.6925260170293283, + "grad_norm": 2.2120566368103027, + "learning_rate": 1.768413454762278e-05, + "loss": 1.2232, + "step": 915 + }, + { + "epoch": 0.693282876064333, + "grad_norm": 2.491506338119507, + "learning_rate": 1.767899442766764e-05, + "loss": 1.2185, + "step": 916 + }, + { + "epoch": 0.6940397350993377, + "grad_norm": 2.299386501312256, + "learning_rate": 1.7673849358594387e-05, + "loss": 1.2146, + "step": 917 + }, + { + "epoch": 0.6947965941343425, + "grad_norm": 2.367396831512451, + "learning_rate": 1.766869934371945e-05, + "loss": 1.2666, + "step": 918 + }, + { + "epoch": 0.6955534531693472, + "grad_norm": 2.379352331161499, + "learning_rate": 1.766354438636245e-05, + "loss": 1.2295, + "step": 919 + }, + { + "epoch": 0.6963103122043519, + "grad_norm": 2.91322660446167, + "learning_rate": 1.7658384489846197e-05, + "loss": 1.2211, + "step": 920 + }, + { + "epoch": 0.6970671712393567, + "grad_norm": 2.3727736473083496, + "learning_rate": 1.7653219657496675e-05, + "loss": 1.2478, + "step": 921 + }, + { + "epoch": 0.6978240302743614, + "grad_norm": 2.3029327392578125, + "learning_rate": 1.7648049892643064e-05, + "loss": 1.238, + "step": 922 + }, + { + "epoch": 0.6985808893093661, + "grad_norm": 2.3356475830078125, + "learning_rate": 1.7642875198617715e-05, + "loss": 1.1932, + "step": 923 + }, + { + "epoch": 0.6993377483443709, + "grad_norm": 2.5331709384918213, + "learning_rate": 1.7637695578756148e-05, + "loss": 1.1822, + "step": 924 + }, + { + "epoch": 0.7000946073793756, + "grad_norm": 2.714674949645996, + "learning_rate": 1.7632511036397078e-05, + "loss": 1.2454, + "step": 925 + }, + { + "epoch": 0.7008514664143803, + "grad_norm": 2.497758388519287, + "learning_rate": 1.7627321574882373e-05, + "loss": 1.2552, + "step": 926 + }, + { + "epoch": 0.701608325449385, + "grad_norm": 2.6237785816192627, + "learning_rate": 1.7622127197557085e-05, + "loss": 1.2334, + "step": 927 + }, + { + "epoch": 0.7023651844843898, + "grad_norm": 2.4308512210845947, + "learning_rate": 1.7616927907769436e-05, + "loss": 1.2516, + "step": 928 + }, + { + "epoch": 0.7031220435193946, + "grad_norm": 2.1913723945617676, + "learning_rate": 1.7611723708870797e-05, + "loss": 1.267, + "step": 929 + }, + { + "epoch": 0.7038789025543992, + "grad_norm": 2.6569485664367676, + "learning_rate": 1.7606514604215723e-05, + "loss": 1.2301, + "step": 930 + }, + { + "epoch": 0.704635761589404, + "grad_norm": 2.4195547103881836, + "learning_rate": 1.7601300597161918e-05, + "loss": 1.2464, + "step": 931 + }, + { + "epoch": 0.7053926206244087, + "grad_norm": 2.580186128616333, + "learning_rate": 1.7596081691070262e-05, + "loss": 1.2432, + "step": 932 + }, + { + "epoch": 0.7061494796594134, + "grad_norm": 2.4679551124572754, + "learning_rate": 1.759085788930477e-05, + "loss": 1.23, + "step": 933 + }, + { + "epoch": 0.7069063386944182, + "grad_norm": 2.256150722503662, + "learning_rate": 1.7585629195232633e-05, + "loss": 1.2362, + "step": 934 + }, + { + "epoch": 0.7076631977294229, + "grad_norm": 2.7825927734375, + "learning_rate": 1.7580395612224184e-05, + "loss": 1.2542, + "step": 935 + }, + { + "epoch": 0.7084200567644277, + "grad_norm": 2.568265676498413, + "learning_rate": 1.757515714365291e-05, + "loss": 1.3004, + "step": 936 + }, + { + "epoch": 0.7091769157994323, + "grad_norm": 2.422884702682495, + "learning_rate": 1.7569913792895455e-05, + "loss": 1.2382, + "step": 937 + }, + { + "epoch": 0.7099337748344371, + "grad_norm": 2.4318430423736572, + "learning_rate": 1.7564665563331597e-05, + "loss": 1.2421, + "step": 938 + }, + { + "epoch": 0.7106906338694419, + "grad_norm": 2.4188950061798096, + "learning_rate": 1.755941245834426e-05, + "loss": 1.244, + "step": 939 + }, + { + "epoch": 0.7114474929044465, + "grad_norm": 2.37963604927063, + "learning_rate": 1.7554154481319523e-05, + "loss": 1.2137, + "step": 940 + }, + { + "epoch": 0.7122043519394513, + "grad_norm": 2.5548665523529053, + "learning_rate": 1.7548891635646595e-05, + "loss": 1.241, + "step": 941 + }, + { + "epoch": 0.712961210974456, + "grad_norm": 2.384345054626465, + "learning_rate": 1.7543623924717827e-05, + "loss": 1.2299, + "step": 942 + }, + { + "epoch": 0.7137180700094607, + "grad_norm": 2.459399461746216, + "learning_rate": 1.7538351351928705e-05, + "loss": 1.2242, + "step": 943 + }, + { + "epoch": 0.7144749290444655, + "grad_norm": 2.2715373039245605, + "learning_rate": 1.7533073920677847e-05, + "loss": 1.2264, + "step": 944 + }, + { + "epoch": 0.7152317880794702, + "grad_norm": 2.423783302307129, + "learning_rate": 1.752779163436701e-05, + "loss": 1.2061, + "step": 945 + }, + { + "epoch": 0.715988647114475, + "grad_norm": 2.368046283721924, + "learning_rate": 1.7522504496401068e-05, + "loss": 1.2568, + "step": 946 + }, + { + "epoch": 0.7167455061494796, + "grad_norm": 2.7439255714416504, + "learning_rate": 1.7517212510188034e-05, + "loss": 1.2123, + "step": 947 + }, + { + "epoch": 0.7175023651844844, + "grad_norm": 2.3615167140960693, + "learning_rate": 1.751191567913904e-05, + "loss": 1.2188, + "step": 948 + }, + { + "epoch": 0.7182592242194891, + "grad_norm": 2.394190549850464, + "learning_rate": 1.7506614006668346e-05, + "loss": 1.21, + "step": 949 + }, + { + "epoch": 0.7190160832544938, + "grad_norm": 2.2254350185394287, + "learning_rate": 1.7501307496193324e-05, + "loss": 1.2306, + "step": 950 + }, + { + "epoch": 0.7197729422894986, + "grad_norm": 2.734381675720215, + "learning_rate": 1.749599615113447e-05, + "loss": 1.2367, + "step": 951 + }, + { + "epoch": 0.7205298013245033, + "grad_norm": 2.4236867427825928, + "learning_rate": 1.7490679974915404e-05, + "loss": 1.2013, + "step": 952 + }, + { + "epoch": 0.7212866603595081, + "grad_norm": 2.4105286598205566, + "learning_rate": 1.748535897096284e-05, + "loss": 1.1849, + "step": 953 + }, + { + "epoch": 0.7220435193945127, + "grad_norm": 3.396277666091919, + "learning_rate": 1.7480033142706626e-05, + "loss": 1.2018, + "step": 954 + }, + { + "epoch": 0.7228003784295175, + "grad_norm": 2.49308180809021, + "learning_rate": 1.7474702493579704e-05, + "loss": 1.2533, + "step": 955 + }, + { + "epoch": 0.7235572374645223, + "grad_norm": 2.2357521057128906, + "learning_rate": 1.7469367027018134e-05, + "loss": 1.253, + "step": 956 + }, + { + "epoch": 0.7243140964995269, + "grad_norm": 2.3083794116973877, + "learning_rate": 1.746402674646107e-05, + "loss": 1.2062, + "step": 957 + }, + { + "epoch": 0.7250709555345317, + "grad_norm": 2.087985038757324, + "learning_rate": 1.745868165535078e-05, + "loss": 1.2146, + "step": 958 + }, + { + "epoch": 0.7258278145695364, + "grad_norm": 2.1703999042510986, + "learning_rate": 1.7453331757132627e-05, + "loss": 1.2593, + "step": 959 + }, + { + "epoch": 0.7265846736045412, + "grad_norm": 2.644440174102783, + "learning_rate": 1.7447977055255076e-05, + "loss": 1.2744, + "step": 960 + }, + { + "epoch": 0.7273415326395459, + "grad_norm": 2.2902777194976807, + "learning_rate": 1.744261755316968e-05, + "loss": 1.2336, + "step": 961 + }, + { + "epoch": 0.7280983916745506, + "grad_norm": 2.1898083686828613, + "learning_rate": 1.7437253254331103e-05, + "loss": 1.1872, + "step": 962 + }, + { + "epoch": 0.7288552507095554, + "grad_norm": 2.192096710205078, + "learning_rate": 1.7431884162197076e-05, + "loss": 1.1904, + "step": 963 + }, + { + "epoch": 0.72961210974456, + "grad_norm": 2.344484806060791, + "learning_rate": 1.7426510280228447e-05, + "loss": 1.2086, + "step": 964 + }, + { + "epoch": 0.7303689687795648, + "grad_norm": 2.4366836547851562, + "learning_rate": 1.742113161188913e-05, + "loss": 1.2367, + "step": 965 + }, + { + "epoch": 0.7311258278145696, + "grad_norm": 2.3846473693847656, + "learning_rate": 1.7415748160646136e-05, + "loss": 1.2182, + "step": 966 + }, + { + "epoch": 0.7318826868495742, + "grad_norm": 2.432124614715576, + "learning_rate": 1.7410359929969555e-05, + "loss": 1.2345, + "step": 967 + }, + { + "epoch": 0.732639545884579, + "grad_norm": 2.427494525909424, + "learning_rate": 1.7404966923332558e-05, + "loss": 1.2284, + "step": 968 + }, + { + "epoch": 0.7333964049195837, + "grad_norm": 2.3191261291503906, + "learning_rate": 1.73995691442114e-05, + "loss": 1.2092, + "step": 969 + }, + { + "epoch": 0.7341532639545885, + "grad_norm": 2.3739922046661377, + "learning_rate": 1.7394166596085393e-05, + "loss": 1.2276, + "step": 970 + }, + { + "epoch": 0.7349101229895932, + "grad_norm": 2.29589581489563, + "learning_rate": 1.7388759282436953e-05, + "loss": 1.2422, + "step": 971 + }, + { + "epoch": 0.7356669820245979, + "grad_norm": 2.3834304809570312, + "learning_rate": 1.7383347206751542e-05, + "loss": 1.252, + "step": 972 + }, + { + "epoch": 0.7364238410596027, + "grad_norm": 2.2572319507598877, + "learning_rate": 1.7377930372517705e-05, + "loss": 1.2296, + "step": 973 + }, + { + "epoch": 0.7371807000946073, + "grad_norm": 2.6052353382110596, + "learning_rate": 1.7372508783227052e-05, + "loss": 1.2131, + "step": 974 + }, + { + "epoch": 0.7379375591296121, + "grad_norm": 2.4882845878601074, + "learning_rate": 1.7367082442374255e-05, + "loss": 1.2259, + "step": 975 + }, + { + "epoch": 0.7386944181646169, + "grad_norm": 2.301111936569214, + "learning_rate": 1.7361651353457053e-05, + "loss": 1.191, + "step": 976 + }, + { + "epoch": 0.7394512771996216, + "grad_norm": 2.496601104736328, + "learning_rate": 1.7356215519976236e-05, + "loss": 1.1749, + "step": 977 + }, + { + "epoch": 0.7402081362346263, + "grad_norm": 2.4782116413116455, + "learning_rate": 1.7350774945435667e-05, + "loss": 1.2282, + "step": 978 + }, + { + "epoch": 0.740964995269631, + "grad_norm": 2.3096814155578613, + "learning_rate": 1.7345329633342253e-05, + "loss": 1.1811, + "step": 979 + }, + { + "epoch": 0.7417218543046358, + "grad_norm": 2.5999755859375, + "learning_rate": 1.7339879587205966e-05, + "loss": 1.2081, + "step": 980 + }, + { + "epoch": 0.7424787133396404, + "grad_norm": 2.3727262020111084, + "learning_rate": 1.733442481053981e-05, + "loss": 1.2392, + "step": 981 + }, + { + "epoch": 0.7432355723746452, + "grad_norm": 2.621267318725586, + "learning_rate": 1.7328965306859864e-05, + "loss": 1.1715, + "step": 982 + }, + { + "epoch": 0.74399243140965, + "grad_norm": 2.786910057067871, + "learning_rate": 1.732350107968523e-05, + "loss": 1.2307, + "step": 983 + }, + { + "epoch": 0.7447492904446547, + "grad_norm": 2.5757007598876953, + "learning_rate": 1.7318032132538078e-05, + "loss": 1.204, + "step": 984 + }, + { + "epoch": 0.7455061494796594, + "grad_norm": 2.4591543674468994, + "learning_rate": 1.7312558468943595e-05, + "loss": 1.1665, + "step": 985 + }, + { + "epoch": 0.7462630085146641, + "grad_norm": 2.4593307971954346, + "learning_rate": 1.730708009243003e-05, + "loss": 1.2571, + "step": 986 + }, + { + "epoch": 0.7470198675496689, + "grad_norm": 2.507080554962158, + "learning_rate": 1.7301597006528654e-05, + "loss": 1.2222, + "step": 987 + }, + { + "epoch": 0.7477767265846736, + "grad_norm": 2.445662498474121, + "learning_rate": 1.7296109214773782e-05, + "loss": 1.2066, + "step": 988 + }, + { + "epoch": 0.7485335856196783, + "grad_norm": 2.341787099838257, + "learning_rate": 1.7290616720702768e-05, + "loss": 1.2395, + "step": 989 + }, + { + "epoch": 0.7492904446546831, + "grad_norm": 2.569960832595825, + "learning_rate": 1.728511952785598e-05, + "loss": 1.241, + "step": 990 + }, + { + "epoch": 0.7500473036896877, + "grad_norm": 2.5241215229034424, + "learning_rate": 1.7279617639776836e-05, + "loss": 1.2231, + "step": 991 + }, + { + "epoch": 0.7508041627246925, + "grad_norm": 2.4361581802368164, + "learning_rate": 1.727411106001176e-05, + "loss": 1.2381, + "step": 992 + }, + { + "epoch": 0.7515610217596973, + "grad_norm": 2.3338370323181152, + "learning_rate": 1.7268599792110213e-05, + "loss": 1.2526, + "step": 993 + }, + { + "epoch": 0.752317880794702, + "grad_norm": 2.398029327392578, + "learning_rate": 1.726308383962467e-05, + "loss": 1.2405, + "step": 994 + }, + { + "epoch": 0.7530747398297067, + "grad_norm": 2.437852382659912, + "learning_rate": 1.7257563206110636e-05, + "loss": 1.2553, + "step": 995 + }, + { + "epoch": 0.7538315988647114, + "grad_norm": 2.763335704803467, + "learning_rate": 1.7252037895126622e-05, + "loss": 1.2342, + "step": 996 + }, + { + "epoch": 0.7545884578997162, + "grad_norm": 2.5191261768341064, + "learning_rate": 1.7246507910234162e-05, + "loss": 1.2188, + "step": 997 + }, + { + "epoch": 0.755345316934721, + "grad_norm": 2.5067646503448486, + "learning_rate": 1.72409732549978e-05, + "loss": 1.1707, + "step": 998 + }, + { + "epoch": 0.7561021759697256, + "grad_norm": 2.400637626647949, + "learning_rate": 1.7235433932985092e-05, + "loss": 1.1599, + "step": 999 + }, + { + "epoch": 0.7568590350047304, + "grad_norm": 2.410027027130127, + "learning_rate": 1.7229889947766597e-05, + "loss": 1.2442, + "step": 1000 + }, + { + "epoch": 0.7576158940397351, + "grad_norm": 2.3706107139587402, + "learning_rate": 1.7224341302915885e-05, + "loss": 1.2264, + "step": 1001 + }, + { + "epoch": 0.7583727530747398, + "grad_norm": 2.4948031902313232, + "learning_rate": 1.7218788002009527e-05, + "loss": 1.2505, + "step": 1002 + }, + { + "epoch": 0.7591296121097446, + "grad_norm": 2.4337100982666016, + "learning_rate": 1.7213230048627093e-05, + "loss": 1.2416, + "step": 1003 + }, + { + "epoch": 0.7598864711447493, + "grad_norm": 2.2913546562194824, + "learning_rate": 1.7207667446351165e-05, + "loss": 1.22, + "step": 1004 + }, + { + "epoch": 0.760643330179754, + "grad_norm": 2.4365074634552, + "learning_rate": 1.72021001987673e-05, + "loss": 1.2398, + "step": 1005 + }, + { + "epoch": 0.7614001892147587, + "grad_norm": 2.4662392139434814, + "learning_rate": 1.7196528309464067e-05, + "loss": 1.2372, + "step": 1006 + }, + { + "epoch": 0.7621570482497635, + "grad_norm": 2.5085933208465576, + "learning_rate": 1.719095178203302e-05, + "loss": 1.2408, + "step": 1007 + }, + { + "epoch": 0.7629139072847683, + "grad_norm": 2.447695016860962, + "learning_rate": 1.7185370620068705e-05, + "loss": 1.2062, + "step": 1008 + }, + { + "epoch": 0.7636707663197729, + "grad_norm": 2.8076727390289307, + "learning_rate": 1.717978482716865e-05, + "loss": 1.2051, + "step": 1009 + }, + { + "epoch": 0.7644276253547777, + "grad_norm": 2.6247246265411377, + "learning_rate": 1.7174194406933377e-05, + "loss": 1.1861, + "step": 1010 + }, + { + "epoch": 0.7651844843897824, + "grad_norm": 2.6273937225341797, + "learning_rate": 1.7168599362966382e-05, + "loss": 1.1919, + "step": 1011 + }, + { + "epoch": 0.7659413434247871, + "grad_norm": 2.363234281539917, + "learning_rate": 1.7162999698874144e-05, + "loss": 1.203, + "step": 1012 + }, + { + "epoch": 0.7666982024597918, + "grad_norm": 2.4418020248413086, + "learning_rate": 1.7157395418266125e-05, + "loss": 1.2146, + "step": 1013 + }, + { + "epoch": 0.7674550614947966, + "grad_norm": 2.4737863540649414, + "learning_rate": 1.7151786524754755e-05, + "loss": 1.2149, + "step": 1014 + }, + { + "epoch": 0.7682119205298014, + "grad_norm": 2.2613844871520996, + "learning_rate": 1.7146173021955444e-05, + "loss": 1.2276, + "step": 1015 + }, + { + "epoch": 0.768968779564806, + "grad_norm": 2.626579523086548, + "learning_rate": 1.714055491348657e-05, + "loss": 1.2384, + "step": 1016 + }, + { + "epoch": 0.7697256385998108, + "grad_norm": 2.406792163848877, + "learning_rate": 1.7134932202969482e-05, + "loss": 1.2285, + "step": 1017 + }, + { + "epoch": 0.7704824976348155, + "grad_norm": 2.456866979598999, + "learning_rate": 1.7129304894028483e-05, + "loss": 1.1853, + "step": 1018 + }, + { + "epoch": 0.7712393566698202, + "grad_norm": 2.5044846534729004, + "learning_rate": 1.7123672990290864e-05, + "loss": 1.212, + "step": 1019 + }, + { + "epoch": 0.771996215704825, + "grad_norm": 2.1986587047576904, + "learning_rate": 1.7118036495386856e-05, + "loss": 1.2106, + "step": 1020 + }, + { + "epoch": 0.7727530747398297, + "grad_norm": 2.4531362056732178, + "learning_rate": 1.7112395412949662e-05, + "loss": 1.2466, + "step": 1021 + }, + { + "epoch": 0.7735099337748345, + "grad_norm": 2.2251899242401123, + "learning_rate": 1.7106749746615437e-05, + "loss": 1.1857, + "step": 1022 + }, + { + "epoch": 0.7742667928098391, + "grad_norm": 2.2850799560546875, + "learning_rate": 1.7101099500023287e-05, + "loss": 1.2499, + "step": 1023 + }, + { + "epoch": 0.7750236518448439, + "grad_norm": 2.3555169105529785, + "learning_rate": 1.709544467681528e-05, + "loss": 1.2139, + "step": 1024 + }, + { + "epoch": 0.7757805108798487, + "grad_norm": 2.54640531539917, + "learning_rate": 1.7089785280636428e-05, + "loss": 1.2121, + "step": 1025 + }, + { + "epoch": 0.7765373699148533, + "grad_norm": 2.3403546810150146, + "learning_rate": 1.708412131513469e-05, + "loss": 1.2294, + "step": 1026 + }, + { + "epoch": 0.7772942289498581, + "grad_norm": 2.450343370437622, + "learning_rate": 1.707845278396097e-05, + "loss": 1.2224, + "step": 1027 + }, + { + "epoch": 0.7780510879848628, + "grad_norm": 2.4089951515197754, + "learning_rate": 1.707277969076912e-05, + "loss": 1.2295, + "step": 1028 + }, + { + "epoch": 0.7788079470198676, + "grad_norm": 2.244898796081543, + "learning_rate": 1.7067102039215928e-05, + "loss": 1.2656, + "step": 1029 + }, + { + "epoch": 0.7795648060548723, + "grad_norm": 2.2754669189453125, + "learning_rate": 1.7061419832961122e-05, + "loss": 1.2106, + "step": 1030 + }, + { + "epoch": 0.780321665089877, + "grad_norm": 2.3827311992645264, + "learning_rate": 1.7055733075667368e-05, + "loss": 1.1916, + "step": 1031 + }, + { + "epoch": 0.7810785241248818, + "grad_norm": 3.2731504440307617, + "learning_rate": 1.7050041771000258e-05, + "loss": 1.2265, + "step": 1032 + }, + { + "epoch": 0.7818353831598864, + "grad_norm": 2.48207950592041, + "learning_rate": 1.7044345922628326e-05, + "loss": 1.2305, + "step": 1033 + }, + { + "epoch": 0.7825922421948912, + "grad_norm": 2.3561174869537354, + "learning_rate": 1.703864553422302e-05, + "loss": 1.2191, + "step": 1034 + }, + { + "epoch": 0.783349101229896, + "grad_norm": 2.4696364402770996, + "learning_rate": 1.703294060945873e-05, + "loss": 1.2354, + "step": 1035 + }, + { + "epoch": 0.7841059602649006, + "grad_norm": 2.214374542236328, + "learning_rate": 1.7027231152012765e-05, + "loss": 1.1459, + "step": 1036 + }, + { + "epoch": 0.7848628192999054, + "grad_norm": 2.8190994262695312, + "learning_rate": 1.7021517165565352e-05, + "loss": 1.2289, + "step": 1037 + }, + { + "epoch": 0.7856196783349101, + "grad_norm": 2.3381307125091553, + "learning_rate": 1.701579865379964e-05, + "loss": 1.2142, + "step": 1038 + }, + { + "epoch": 0.7863765373699149, + "grad_norm": 2.4270827770233154, + "learning_rate": 1.7010075620401693e-05, + "loss": 1.175, + "step": 1039 + }, + { + "epoch": 0.7871333964049196, + "grad_norm": 2.5499768257141113, + "learning_rate": 1.7004348069060487e-05, + "loss": 1.1907, + "step": 1040 + }, + { + "epoch": 0.7878902554399243, + "grad_norm": 2.8665435314178467, + "learning_rate": 1.6998616003467923e-05, + "loss": 1.232, + "step": 1041 + }, + { + "epoch": 0.7886471144749291, + "grad_norm": 2.443026065826416, + "learning_rate": 1.6992879427318798e-05, + "loss": 1.206, + "step": 1042 + }, + { + "epoch": 0.7894039735099337, + "grad_norm": 2.408712148666382, + "learning_rate": 1.6987138344310822e-05, + "loss": 1.1984, + "step": 1043 + }, + { + "epoch": 0.7901608325449385, + "grad_norm": 2.5489931106567383, + "learning_rate": 1.6981392758144616e-05, + "loss": 1.27, + "step": 1044 + }, + { + "epoch": 0.7909176915799432, + "grad_norm": 2.2722368240356445, + "learning_rate": 1.6975642672523684e-05, + "loss": 1.2572, + "step": 1045 + }, + { + "epoch": 0.791674550614948, + "grad_norm": 2.921919822692871, + "learning_rate": 1.6969888091154452e-05, + "loss": 1.2433, + "step": 1046 + }, + { + "epoch": 0.7924314096499527, + "grad_norm": 2.41582989692688, + "learning_rate": 1.6964129017746236e-05, + "loss": 1.2537, + "step": 1047 + }, + { + "epoch": 0.7931882686849574, + "grad_norm": 2.557302474975586, + "learning_rate": 1.695836545601125e-05, + "loss": 1.2248, + "step": 1048 + }, + { + "epoch": 0.7939451277199622, + "grad_norm": 2.4108498096466064, + "learning_rate": 1.6952597409664587e-05, + "loss": 1.2337, + "step": 1049 + }, + { + "epoch": 0.7947019867549668, + "grad_norm": 2.3067305088043213, + "learning_rate": 1.694682488242425e-05, + "loss": 1.1974, + "step": 1050 + }, + { + "epoch": 0.7954588457899716, + "grad_norm": 2.3873379230499268, + "learning_rate": 1.6941047878011122e-05, + "loss": 1.204, + "step": 1051 + }, + { + "epoch": 0.7962157048249764, + "grad_norm": 2.3166935443878174, + "learning_rate": 1.6935266400148963e-05, + "loss": 1.1585, + "step": 1052 + }, + { + "epoch": 0.796972563859981, + "grad_norm": 2.312579870223999, + "learning_rate": 1.6929480452564438e-05, + "loss": 1.2178, + "step": 1053 + }, + { + "epoch": 0.7977294228949858, + "grad_norm": 2.3897957801818848, + "learning_rate": 1.6923690038987075e-05, + "loss": 1.1867, + "step": 1054 + }, + { + "epoch": 0.7984862819299905, + "grad_norm": 2.5109200477600098, + "learning_rate": 1.6917895163149282e-05, + "loss": 1.2219, + "step": 1055 + }, + { + "epoch": 0.7992431409649953, + "grad_norm": 2.4277844429016113, + "learning_rate": 1.6912095828786353e-05, + "loss": 1.2224, + "step": 1056 + }, + { + "epoch": 0.8, + "grad_norm": 2.285210609436035, + "learning_rate": 1.6906292039636452e-05, + "loss": 1.233, + "step": 1057 + }, + { + "epoch": 0.8007568590350047, + "grad_norm": 2.475517511367798, + "learning_rate": 1.690048379944061e-05, + "loss": 1.2606, + "step": 1058 + }, + { + "epoch": 0.8015137180700095, + "grad_norm": 2.2740111351013184, + "learning_rate": 1.6894671111942733e-05, + "loss": 1.1516, + "step": 1059 + }, + { + "epoch": 0.8022705771050141, + "grad_norm": 2.777266263961792, + "learning_rate": 1.6888853980889583e-05, + "loss": 1.2257, + "step": 1060 + }, + { + "epoch": 0.8030274361400189, + "grad_norm": 2.4774162769317627, + "learning_rate": 1.6883032410030796e-05, + "loss": 1.2161, + "step": 1061 + }, + { + "epoch": 0.8037842951750237, + "grad_norm": 2.4283878803253174, + "learning_rate": 1.6877206403118875e-05, + "loss": 1.2258, + "step": 1062 + }, + { + "epoch": 0.8045411542100284, + "grad_norm": 2.2770519256591797, + "learning_rate": 1.687137596390917e-05, + "loss": 1.2209, + "step": 1063 + }, + { + "epoch": 0.8052980132450331, + "grad_norm": 2.4034667015075684, + "learning_rate": 1.6865541096159895e-05, + "loss": 1.1773, + "step": 1064 + }, + { + "epoch": 0.8060548722800378, + "grad_norm": 2.2633402347564697, + "learning_rate": 1.6859701803632117e-05, + "loss": 1.2552, + "step": 1065 + }, + { + "epoch": 0.8068117313150426, + "grad_norm": 2.3959109783172607, + "learning_rate": 1.6853858090089753e-05, + "loss": 1.27, + "step": 1066 + }, + { + "epoch": 0.8075685903500474, + "grad_norm": 2.4210898876190186, + "learning_rate": 1.6848009959299575e-05, + "loss": 1.2173, + "step": 1067 + }, + { + "epoch": 0.808325449385052, + "grad_norm": 2.3308327198028564, + "learning_rate": 1.6842157415031194e-05, + "loss": 1.2738, + "step": 1068 + }, + { + "epoch": 0.8090823084200568, + "grad_norm": 3.167160987854004, + "learning_rate": 1.683630046105707e-05, + "loss": 1.2349, + "step": 1069 + }, + { + "epoch": 0.8098391674550615, + "grad_norm": 2.1552276611328125, + "learning_rate": 1.6830439101152513e-05, + "loss": 1.2436, + "step": 1070 + }, + { + "epoch": 0.8105960264900662, + "grad_norm": 2.5437731742858887, + "learning_rate": 1.682457333909566e-05, + "loss": 1.2039, + "step": 1071 + }, + { + "epoch": 0.811352885525071, + "grad_norm": 2.1334102153778076, + "learning_rate": 1.6818703178667496e-05, + "loss": 1.2173, + "step": 1072 + }, + { + "epoch": 0.8121097445600757, + "grad_norm": 2.5743660926818848, + "learning_rate": 1.6812828623651832e-05, + "loss": 1.2132, + "step": 1073 + }, + { + "epoch": 0.8128666035950805, + "grad_norm": 2.4903461933135986, + "learning_rate": 1.6806949677835328e-05, + "loss": 1.2428, + "step": 1074 + }, + { + "epoch": 0.8136234626300851, + "grad_norm": 2.2703421115875244, + "learning_rate": 1.6801066345007447e-05, + "loss": 1.1828, + "step": 1075 + }, + { + "epoch": 0.8143803216650899, + "grad_norm": 2.615246295928955, + "learning_rate": 1.6795178628960508e-05, + "loss": 1.2361, + "step": 1076 + }, + { + "epoch": 0.8151371807000946, + "grad_norm": 2.6063549518585205, + "learning_rate": 1.6789286533489635e-05, + "loss": 1.2586, + "step": 1077 + }, + { + "epoch": 0.8158940397350993, + "grad_norm": 2.725470542907715, + "learning_rate": 1.6783390062392788e-05, + "loss": 1.2166, + "step": 1078 + }, + { + "epoch": 0.8166508987701041, + "grad_norm": 2.576597213745117, + "learning_rate": 1.6777489219470743e-05, + "loss": 1.231, + "step": 1079 + }, + { + "epoch": 0.8174077578051088, + "grad_norm": 2.37703537940979, + "learning_rate": 1.677158400852708e-05, + "loss": 1.2663, + "step": 1080 + }, + { + "epoch": 0.8181646168401135, + "grad_norm": 2.3021481037139893, + "learning_rate": 1.6765674433368232e-05, + "loss": 1.2091, + "step": 1081 + }, + { + "epoch": 0.8189214758751182, + "grad_norm": 2.4437525272369385, + "learning_rate": 1.67597604978034e-05, + "loss": 1.2123, + "step": 1082 + }, + { + "epoch": 0.819678334910123, + "grad_norm": 2.470407724380493, + "learning_rate": 1.6753842205644628e-05, + "loss": 1.1948, + "step": 1083 + }, + { + "epoch": 0.8204351939451278, + "grad_norm": 2.5628767013549805, + "learning_rate": 1.6747919560706752e-05, + "loss": 1.2347, + "step": 1084 + }, + { + "epoch": 0.8211920529801324, + "grad_norm": 2.5520646572113037, + "learning_rate": 1.6741992566807416e-05, + "loss": 1.2319, + "step": 1085 + }, + { + "epoch": 0.8219489120151372, + "grad_norm": 2.4275975227355957, + "learning_rate": 1.673606122776708e-05, + "loss": 1.1666, + "step": 1086 + }, + { + "epoch": 0.8227057710501419, + "grad_norm": 2.803802728652954, + "learning_rate": 1.6730125547408984e-05, + "loss": 1.1861, + "step": 1087 + }, + { + "epoch": 0.8234626300851466, + "grad_norm": 3.4586920738220215, + "learning_rate": 1.6724185529559185e-05, + "loss": 1.2278, + "step": 1088 + }, + { + "epoch": 0.8242194891201514, + "grad_norm": 2.3933305740356445, + "learning_rate": 1.6718241178046526e-05, + "loss": 1.2148, + "step": 1089 + }, + { + "epoch": 0.8249763481551561, + "grad_norm": 2.64758038520813, + "learning_rate": 1.671229249670264e-05, + "loss": 1.2138, + "step": 1090 + }, + { + "epoch": 0.8257332071901609, + "grad_norm": 2.389108896255493, + "learning_rate": 1.6706339489361962e-05, + "loss": 1.2295, + "step": 1091 + }, + { + "epoch": 0.8264900662251655, + "grad_norm": 2.5130155086517334, + "learning_rate": 1.6700382159861705e-05, + "loss": 1.196, + "step": 1092 + }, + { + "epoch": 0.8272469252601703, + "grad_norm": 2.287849187850952, + "learning_rate": 1.6694420512041878e-05, + "loss": 1.2286, + "step": 1093 + }, + { + "epoch": 0.8280037842951751, + "grad_norm": 2.378422737121582, + "learning_rate": 1.6688454549745263e-05, + "loss": 1.2518, + "step": 1094 + }, + { + "epoch": 0.8287606433301797, + "grad_norm": 2.3797566890716553, + "learning_rate": 1.6682484276817433e-05, + "loss": 1.2228, + "step": 1095 + }, + { + "epoch": 0.8295175023651845, + "grad_norm": 2.276672124862671, + "learning_rate": 1.667650969710673e-05, + "loss": 1.2281, + "step": 1096 + }, + { + "epoch": 0.8302743614001892, + "grad_norm": 2.550900459289551, + "learning_rate": 1.6670530814464284e-05, + "loss": 1.2279, + "step": 1097 + }, + { + "epoch": 0.831031220435194, + "grad_norm": 2.5035128593444824, + "learning_rate": 1.6664547632743987e-05, + "loss": 1.1628, + "step": 1098 + }, + { + "epoch": 0.8317880794701987, + "grad_norm": 2.664567708969116, + "learning_rate": 1.6658560155802506e-05, + "loss": 1.187, + "step": 1099 + }, + { + "epoch": 0.8325449385052034, + "grad_norm": 2.5373306274414062, + "learning_rate": 1.665256838749928e-05, + "loss": 1.2422, + "step": 1100 + }, + { + "epoch": 0.8333017975402082, + "grad_norm": 2.7911324501037598, + "learning_rate": 1.664657233169651e-05, + "loss": 1.1982, + "step": 1101 + }, + { + "epoch": 0.8340586565752128, + "grad_norm": 2.663367509841919, + "learning_rate": 1.664057199225916e-05, + "loss": 1.2578, + "step": 1102 + }, + { + "epoch": 0.8348155156102176, + "grad_norm": 2.486424684524536, + "learning_rate": 1.663456737305496e-05, + "loss": 1.2106, + "step": 1103 + }, + { + "epoch": 0.8355723746452224, + "grad_norm": 2.503634214401245, + "learning_rate": 1.66285584779544e-05, + "loss": 1.2192, + "step": 1104 + }, + { + "epoch": 0.836329233680227, + "grad_norm": 2.679033041000366, + "learning_rate": 1.6622545310830712e-05, + "loss": 1.204, + "step": 1105 + }, + { + "epoch": 0.8370860927152318, + "grad_norm": 2.7814950942993164, + "learning_rate": 1.66165278755599e-05, + "loss": 1.2133, + "step": 1106 + }, + { + "epoch": 0.8378429517502365, + "grad_norm": 2.5719947814941406, + "learning_rate": 1.6610506176020707e-05, + "loss": 1.2457, + "step": 1107 + }, + { + "epoch": 0.8385998107852413, + "grad_norm": 3.662503957748413, + "learning_rate": 1.660448021609463e-05, + "loss": 1.2288, + "step": 1108 + }, + { + "epoch": 0.8393566698202459, + "grad_norm": 2.62904691696167, + "learning_rate": 1.659844999966591e-05, + "loss": 1.2202, + "step": 1109 + }, + { + "epoch": 0.8401135288552507, + "grad_norm": 2.6756417751312256, + "learning_rate": 1.659241553062154e-05, + "loss": 1.2438, + "step": 1110 + }, + { + "epoch": 0.8408703878902555, + "grad_norm": 2.762983798980713, + "learning_rate": 1.6586376812851233e-05, + "loss": 1.1499, + "step": 1111 + }, + { + "epoch": 0.8416272469252601, + "grad_norm": 2.4654974937438965, + "learning_rate": 1.6580333850247462e-05, + "loss": 1.2398, + "step": 1112 + }, + { + "epoch": 0.8423841059602649, + "grad_norm": 2.5800747871398926, + "learning_rate": 1.657428664670543e-05, + "loss": 1.2564, + "step": 1113 + }, + { + "epoch": 0.8431409649952696, + "grad_norm": 2.4179458618164062, + "learning_rate": 1.6568235206123073e-05, + "loss": 1.1874, + "step": 1114 + }, + { + "epoch": 0.8438978240302744, + "grad_norm": 2.4252541065216064, + "learning_rate": 1.6562179532401053e-05, + "loss": 1.2568, + "step": 1115 + }, + { + "epoch": 0.8446546830652791, + "grad_norm": 2.5447540283203125, + "learning_rate": 1.6556119629442764e-05, + "loss": 1.1884, + "step": 1116 + }, + { + "epoch": 0.8454115421002838, + "grad_norm": 2.5056309700012207, + "learning_rate": 1.655005550115433e-05, + "loss": 1.2665, + "step": 1117 + }, + { + "epoch": 0.8461684011352886, + "grad_norm": 2.7429358959198, + "learning_rate": 1.65439871514446e-05, + "loss": 1.1835, + "step": 1118 + }, + { + "epoch": 0.8469252601702932, + "grad_norm": 2.6835551261901855, + "learning_rate": 1.653791458422513e-05, + "loss": 1.2781, + "step": 1119 + }, + { + "epoch": 0.847682119205298, + "grad_norm": 2.8627474308013916, + "learning_rate": 1.653183780341021e-05, + "loss": 1.1931, + "step": 1120 + }, + { + "epoch": 0.8484389782403028, + "grad_norm": 2.758310556411743, + "learning_rate": 1.652575681291684e-05, + "loss": 1.1944, + "step": 1121 + }, + { + "epoch": 0.8491958372753075, + "grad_norm": 2.761715888977051, + "learning_rate": 1.6519671616664734e-05, + "loss": 1.2457, + "step": 1122 + }, + { + "epoch": 0.8499526963103122, + "grad_norm": 2.8214142322540283, + "learning_rate": 1.6513582218576315e-05, + "loss": 1.2203, + "step": 1123 + }, + { + "epoch": 0.8507095553453169, + "grad_norm": 2.6501047611236572, + "learning_rate": 1.6507488622576712e-05, + "loss": 1.2591, + "step": 1124 + }, + { + "epoch": 0.8514664143803217, + "grad_norm": 2.4939935207366943, + "learning_rate": 1.6501390832593777e-05, + "loss": 1.2205, + "step": 1125 + }, + { + "epoch": 0.8522232734153264, + "grad_norm": 2.5232126712799072, + "learning_rate": 1.6495288852558036e-05, + "loss": 1.2055, + "step": 1126 + }, + { + "epoch": 0.8529801324503311, + "grad_norm": 2.805695056915283, + "learning_rate": 1.6489182686402753e-05, + "loss": 1.2069, + "step": 1127 + }, + { + "epoch": 0.8537369914853359, + "grad_norm": 2.588597059249878, + "learning_rate": 1.6483072338063844e-05, + "loss": 1.1991, + "step": 1128 + }, + { + "epoch": 0.8544938505203405, + "grad_norm": 2.632336378097534, + "learning_rate": 1.6476957811479966e-05, + "loss": 1.2556, + "step": 1129 + }, + { + "epoch": 0.8552507095553453, + "grad_norm": 2.6084611415863037, + "learning_rate": 1.6470839110592445e-05, + "loss": 1.2257, + "step": 1130 + }, + { + "epoch": 0.8560075685903501, + "grad_norm": 2.4824182987213135, + "learning_rate": 1.6464716239345296e-05, + "loss": 1.1898, + "step": 1131 + }, + { + "epoch": 0.8567644276253548, + "grad_norm": 2.5742006301879883, + "learning_rate": 1.6458589201685235e-05, + "loss": 1.2024, + "step": 1132 + }, + { + "epoch": 0.8575212866603595, + "grad_norm": 2.2470591068267822, + "learning_rate": 1.6452458001561655e-05, + "loss": 1.2154, + "step": 1133 + }, + { + "epoch": 0.8582781456953642, + "grad_norm": 2.5146355628967285, + "learning_rate": 1.6446322642926636e-05, + "loss": 1.2202, + "step": 1134 + }, + { + "epoch": 0.859035004730369, + "grad_norm": 2.580735683441162, + "learning_rate": 1.644018312973493e-05, + "loss": 1.1595, + "step": 1135 + }, + { + "epoch": 0.8597918637653738, + "grad_norm": 2.558544635772705, + "learning_rate": 1.6434039465943984e-05, + "loss": 1.2048, + "step": 1136 + }, + { + "epoch": 0.8605487228003784, + "grad_norm": 2.5136754512786865, + "learning_rate": 1.64278916555139e-05, + "loss": 1.2003, + "step": 1137 + }, + { + "epoch": 0.8613055818353832, + "grad_norm": 2.7524209022521973, + "learning_rate": 1.6421739702407468e-05, + "loss": 1.1862, + "step": 1138 + }, + { + "epoch": 0.8620624408703879, + "grad_norm": 2.519251585006714, + "learning_rate": 1.6415583610590144e-05, + "loss": 1.2471, + "step": 1139 + }, + { + "epoch": 0.8628192999053926, + "grad_norm": 2.7237823009490967, + "learning_rate": 1.6409423384030046e-05, + "loss": 1.1878, + "step": 1140 + }, + { + "epoch": 0.8635761589403973, + "grad_norm": 2.595668315887451, + "learning_rate": 1.6403259026697967e-05, + "loss": 1.2164, + "step": 1141 + }, + { + "epoch": 0.8643330179754021, + "grad_norm": 2.6703858375549316, + "learning_rate": 1.6397090542567356e-05, + "loss": 1.1944, + "step": 1142 + }, + { + "epoch": 0.8650898770104068, + "grad_norm": 2.558354139328003, + "learning_rate": 1.639091793561432e-05, + "loss": 1.2423, + "step": 1143 + }, + { + "epoch": 0.8658467360454115, + "grad_norm": 2.445343494415283, + "learning_rate": 1.6384741209817638e-05, + "loss": 1.1989, + "step": 1144 + }, + { + "epoch": 0.8666035950804163, + "grad_norm": 2.3659980297088623, + "learning_rate": 1.6378560369158724e-05, + "loss": 1.1969, + "step": 1145 + }, + { + "epoch": 0.867360454115421, + "grad_norm": 2.6195647716522217, + "learning_rate": 1.6372375417621654e-05, + "loss": 1.2012, + "step": 1146 + }, + { + "epoch": 0.8681173131504257, + "grad_norm": 2.528627395629883, + "learning_rate": 1.6366186359193155e-05, + "loss": 1.2365, + "step": 1147 + }, + { + "epoch": 0.8688741721854305, + "grad_norm": 2.360337734222412, + "learning_rate": 1.6359993197862604e-05, + "loss": 1.2192, + "step": 1148 + }, + { + "epoch": 0.8696310312204352, + "grad_norm": 2.3621413707733154, + "learning_rate": 1.635379593762201e-05, + "loss": 1.2015, + "step": 1149 + }, + { + "epoch": 0.8703878902554399, + "grad_norm": 2.5273406505584717, + "learning_rate": 1.6347594582466038e-05, + "loss": 1.187, + "step": 1150 + }, + { + "epoch": 0.8711447492904446, + "grad_norm": 2.8172874450683594, + "learning_rate": 1.6341389136391985e-05, + "loss": 1.2271, + "step": 1151 + }, + { + "epoch": 0.8719016083254494, + "grad_norm": 2.3418102264404297, + "learning_rate": 1.6335179603399788e-05, + "loss": 1.2358, + "step": 1152 + }, + { + "epoch": 0.8726584673604542, + "grad_norm": 2.415493965148926, + "learning_rate": 1.632896598749202e-05, + "loss": 1.2717, + "step": 1153 + }, + { + "epoch": 0.8734153263954588, + "grad_norm": 2.3262200355529785, + "learning_rate": 1.6322748292673875e-05, + "loss": 1.2198, + "step": 1154 + }, + { + "epoch": 0.8741721854304636, + "grad_norm": 2.8730580806732178, + "learning_rate": 1.6316526522953195e-05, + "loss": 1.183, + "step": 1155 + }, + { + "epoch": 0.8749290444654683, + "grad_norm": 2.432713508605957, + "learning_rate": 1.631030068234043e-05, + "loss": 1.2093, + "step": 1156 + }, + { + "epoch": 0.875685903500473, + "grad_norm": 2.6007068157196045, + "learning_rate": 1.630407077484866e-05, + "loss": 1.2506, + "step": 1157 + }, + { + "epoch": 0.8764427625354778, + "grad_norm": 2.785717487335205, + "learning_rate": 1.6297836804493598e-05, + "loss": 1.2073, + "step": 1158 + }, + { + "epoch": 0.8771996215704825, + "grad_norm": 2.498161792755127, + "learning_rate": 1.629159877529356e-05, + "loss": 1.2297, + "step": 1159 + }, + { + "epoch": 0.8779564806054873, + "grad_norm": 2.6516387462615967, + "learning_rate": 1.628535669126948e-05, + "loss": 1.2242, + "step": 1160 + }, + { + "epoch": 0.8787133396404919, + "grad_norm": 2.164231300354004, + "learning_rate": 1.627911055644492e-05, + "loss": 1.242, + "step": 1161 + }, + { + "epoch": 0.8794701986754967, + "grad_norm": 2.1503818035125732, + "learning_rate": 1.6272860374846037e-05, + "loss": 1.2187, + "step": 1162 + }, + { + "epoch": 0.8802270577105015, + "grad_norm": 1.9819633960723877, + "learning_rate": 1.6266606150501608e-05, + "loss": 1.2044, + "step": 1163 + }, + { + "epoch": 0.8809839167455061, + "grad_norm": 2.251472234725952, + "learning_rate": 1.6260347887443e-05, + "loss": 1.2262, + "step": 1164 + }, + { + "epoch": 0.8817407757805109, + "grad_norm": 2.2765519618988037, + "learning_rate": 1.625408558970421e-05, + "loss": 1.185, + "step": 1165 + }, + { + "epoch": 0.8824976348155156, + "grad_norm": 2.4048166275024414, + "learning_rate": 1.6247819261321803e-05, + "loss": 1.1973, + "step": 1166 + }, + { + "epoch": 0.8832544938505204, + "grad_norm": 2.234778881072998, + "learning_rate": 1.624154890633497e-05, + "loss": 1.1795, + "step": 1167 + }, + { + "epoch": 0.8840113528855251, + "grad_norm": 2.2173893451690674, + "learning_rate": 1.623527452878548e-05, + "loss": 1.1897, + "step": 1168 + }, + { + "epoch": 0.8847682119205298, + "grad_norm": 2.3917415142059326, + "learning_rate": 1.6228996132717702e-05, + "loss": 1.2425, + "step": 1169 + }, + { + "epoch": 0.8855250709555346, + "grad_norm": 2.254404306411743, + "learning_rate": 1.62227137221786e-05, + "loss": 1.2495, + "step": 1170 + }, + { + "epoch": 0.8862819299905392, + "grad_norm": 2.1990530490875244, + "learning_rate": 1.6216427301217713e-05, + "loss": 1.1757, + "step": 1171 + }, + { + "epoch": 0.887038789025544, + "grad_norm": 2.3781630992889404, + "learning_rate": 1.6210136873887176e-05, + "loss": 1.2387, + "step": 1172 + }, + { + "epoch": 0.8877956480605487, + "grad_norm": 2.1774098873138428, + "learning_rate": 1.6203842444241703e-05, + "loss": 1.1937, + "step": 1173 + }, + { + "epoch": 0.8885525070955534, + "grad_norm": 2.3653695583343506, + "learning_rate": 1.619754401633858e-05, + "loss": 1.2115, + "step": 1174 + }, + { + "epoch": 0.8893093661305582, + "grad_norm": 2.3032443523406982, + "learning_rate": 1.619124159423769e-05, + "loss": 1.1802, + "step": 1175 + }, + { + "epoch": 0.8900662251655629, + "grad_norm": 2.2687666416168213, + "learning_rate": 1.618493518200147e-05, + "loss": 1.1868, + "step": 1176 + }, + { + "epoch": 0.8908230842005677, + "grad_norm": 2.3060355186462402, + "learning_rate": 1.6178624783694937e-05, + "loss": 1.1933, + "step": 1177 + }, + { + "epoch": 0.8915799432355723, + "grad_norm": 3.4064903259277344, + "learning_rate": 1.6172310403385677e-05, + "loss": 1.234, + "step": 1178 + }, + { + "epoch": 0.8923368022705771, + "grad_norm": 2.568434715270996, + "learning_rate": 1.616599204514385e-05, + "loss": 1.2115, + "step": 1179 + }, + { + "epoch": 0.8930936613055819, + "grad_norm": 2.2627182006835938, + "learning_rate": 1.6159669713042166e-05, + "loss": 1.2229, + "step": 1180 + }, + { + "epoch": 0.8938505203405865, + "grad_norm": 2.2551677227020264, + "learning_rate": 1.615334341115591e-05, + "loss": 1.2481, + "step": 1181 + }, + { + "epoch": 0.8946073793755913, + "grad_norm": 2.4630017280578613, + "learning_rate": 1.6147013143562915e-05, + "loss": 1.1769, + "step": 1182 + }, + { + "epoch": 0.895364238410596, + "grad_norm": 2.2972466945648193, + "learning_rate": 1.6140678914343575e-05, + "loss": 1.2028, + "step": 1183 + }, + { + "epoch": 0.8961210974456008, + "grad_norm": 2.343468189239502, + "learning_rate": 1.6134340727580843e-05, + "loss": 1.2356, + "step": 1184 + }, + { + "epoch": 0.8968779564806055, + "grad_norm": 2.2180895805358887, + "learning_rate": 1.6127998587360208e-05, + "loss": 1.2082, + "step": 1185 + }, + { + "epoch": 0.8976348155156102, + "grad_norm": 2.121718406677246, + "learning_rate": 1.6121652497769727e-05, + "loss": 1.2052, + "step": 1186 + }, + { + "epoch": 0.898391674550615, + "grad_norm": 2.2796201705932617, + "learning_rate": 1.6115302462899982e-05, + "loss": 1.2056, + "step": 1187 + }, + { + "epoch": 0.8991485335856196, + "grad_norm": 2.1909053325653076, + "learning_rate": 1.6108948486844118e-05, + "loss": 1.1556, + "step": 1188 + }, + { + "epoch": 0.8999053926206244, + "grad_norm": 2.4068331718444824, + "learning_rate": 1.610259057369781e-05, + "loss": 1.2258, + "step": 1189 + }, + { + "epoch": 0.9006622516556292, + "grad_norm": 2.168159246444702, + "learning_rate": 1.6096228727559265e-05, + "loss": 1.1805, + "step": 1190 + }, + { + "epoch": 0.9014191106906339, + "grad_norm": 2.129645586013794, + "learning_rate": 1.608986295252924e-05, + "loss": 1.1725, + "step": 1191 + }, + { + "epoch": 0.9021759697256386, + "grad_norm": 2.3025479316711426, + "learning_rate": 1.608349325271101e-05, + "loss": 1.1873, + "step": 1192 + }, + { + "epoch": 0.9029328287606433, + "grad_norm": 2.3402655124664307, + "learning_rate": 1.607711963221039e-05, + "loss": 1.2049, + "step": 1193 + }, + { + "epoch": 0.9036896877956481, + "grad_norm": 2.844715118408203, + "learning_rate": 1.6070742095135722e-05, + "loss": 1.1654, + "step": 1194 + }, + { + "epoch": 0.9044465468306528, + "grad_norm": 2.337291717529297, + "learning_rate": 1.6064360645597862e-05, + "loss": 1.2119, + "step": 1195 + }, + { + "epoch": 0.9052034058656575, + "grad_norm": 2.2666563987731934, + "learning_rate": 1.60579752877102e-05, + "loss": 1.1662, + "step": 1196 + }, + { + "epoch": 0.9059602649006623, + "grad_norm": 2.2546000480651855, + "learning_rate": 1.6051586025588634e-05, + "loss": 1.1612, + "step": 1197 + }, + { + "epoch": 0.906717123935667, + "grad_norm": 2.4789252281188965, + "learning_rate": 1.6045192863351594e-05, + "loss": 1.159, + "step": 1198 + }, + { + "epoch": 0.9074739829706717, + "grad_norm": 2.5757458209991455, + "learning_rate": 1.6038795805120005e-05, + "loss": 1.1359, + "step": 1199 + }, + { + "epoch": 0.9082308420056765, + "grad_norm": 2.5025620460510254, + "learning_rate": 1.603239485501732e-05, + "loss": 1.2513, + "step": 1200 + }, + { + "epoch": 0.9089877010406812, + "grad_norm": 2.6904783248901367, + "learning_rate": 1.6025990017169495e-05, + "loss": 1.2275, + "step": 1201 + }, + { + "epoch": 0.9097445600756859, + "grad_norm": 2.121021270751953, + "learning_rate": 1.6019581295704985e-05, + "loss": 1.2138, + "step": 1202 + }, + { + "epoch": 0.9105014191106906, + "grad_norm": 2.2942512035369873, + "learning_rate": 1.601316869475476e-05, + "loss": 1.207, + "step": 1203 + }, + { + "epoch": 0.9112582781456954, + "grad_norm": 2.4668707847595215, + "learning_rate": 1.6006752218452283e-05, + "loss": 1.2422, + "step": 1204 + }, + { + "epoch": 0.9120151371807, + "grad_norm": 2.536863088607788, + "learning_rate": 1.600033187093351e-05, + "loss": 1.2256, + "step": 1205 + }, + { + "epoch": 0.9127719962157048, + "grad_norm": 3.008856773376465, + "learning_rate": 1.599390765633691e-05, + "loss": 1.2129, + "step": 1206 + }, + { + "epoch": 0.9135288552507096, + "grad_norm": 2.292177438735962, + "learning_rate": 1.5987479578803425e-05, + "loss": 1.2237, + "step": 1207 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 2.4802656173706055, + "learning_rate": 1.59810476424765e-05, + "loss": 1.2307, + "step": 1208 + }, + { + "epoch": 0.915042573320719, + "grad_norm": 2.225219964981079, + "learning_rate": 1.5974611851502064e-05, + "loss": 1.1845, + "step": 1209 + }, + { + "epoch": 0.9157994323557237, + "grad_norm": 2.584470272064209, + "learning_rate": 1.5968172210028525e-05, + "loss": 1.1756, + "step": 1210 + }, + { + "epoch": 0.9165562913907285, + "grad_norm": 2.3518307209014893, + "learning_rate": 1.596172872220679e-05, + "loss": 1.1788, + "step": 1211 + }, + { + "epoch": 0.9173131504257332, + "grad_norm": 2.3497278690338135, + "learning_rate": 1.595528139219021e-05, + "loss": 1.2084, + "step": 1212 + }, + { + "epoch": 0.9180700094607379, + "grad_norm": 2.117664337158203, + "learning_rate": 1.594883022413466e-05, + "loss": 1.1765, + "step": 1213 + }, + { + "epoch": 0.9188268684957427, + "grad_norm": 2.1322619915008545, + "learning_rate": 1.594237522219845e-05, + "loss": 1.1835, + "step": 1214 + }, + { + "epoch": 0.9195837275307474, + "grad_norm": 2.0177836418151855, + "learning_rate": 1.5935916390542377e-05, + "loss": 1.178, + "step": 1215 + }, + { + "epoch": 0.9203405865657521, + "grad_norm": 2.2327425479888916, + "learning_rate": 1.5929453733329713e-05, + "loss": 1.1916, + "step": 1216 + }, + { + "epoch": 0.9210974456007569, + "grad_norm": 2.168905258178711, + "learning_rate": 1.592298725472618e-05, + "loss": 1.2139, + "step": 1217 + }, + { + "epoch": 0.9218543046357616, + "grad_norm": 2.275158166885376, + "learning_rate": 1.591651695889998e-05, + "loss": 1.2014, + "step": 1218 + }, + { + "epoch": 0.9226111636707663, + "grad_norm": 2.153704881668091, + "learning_rate": 1.5910042850021754e-05, + "loss": 1.2219, + "step": 1219 + }, + { + "epoch": 0.923368022705771, + "grad_norm": 2.161616802215576, + "learning_rate": 1.5903564932264624e-05, + "loss": 1.2452, + "step": 1220 + }, + { + "epoch": 0.9241248817407758, + "grad_norm": 2.1606664657592773, + "learning_rate": 1.589708320980416e-05, + "loss": 1.1448, + "step": 1221 + }, + { + "epoch": 0.9248817407757806, + "grad_norm": 2.040039300918579, + "learning_rate": 1.589059768681837e-05, + "loss": 1.235, + "step": 1222 + }, + { + "epoch": 0.9256385998107852, + "grad_norm": 2.2927193641662598, + "learning_rate": 1.5884108367487732e-05, + "loss": 1.19, + "step": 1223 + }, + { + "epoch": 0.92639545884579, + "grad_norm": 2.2096221446990967, + "learning_rate": 1.587761525599516e-05, + "loss": 1.2349, + "step": 1224 + }, + { + "epoch": 0.9271523178807947, + "grad_norm": 2.1982614994049072, + "learning_rate": 1.5871118356526017e-05, + "loss": 1.206, + "step": 1225 + }, + { + "epoch": 0.9279091769157994, + "grad_norm": 2.2477710247039795, + "learning_rate": 1.5864617673268096e-05, + "loss": 1.2044, + "step": 1226 + }, + { + "epoch": 0.9286660359508042, + "grad_norm": 2.126891851425171, + "learning_rate": 1.5858113210411646e-05, + "loss": 1.1685, + "step": 1227 + }, + { + "epoch": 0.9294228949858089, + "grad_norm": 2.6382102966308594, + "learning_rate": 1.585160497214935e-05, + "loss": 1.2247, + "step": 1228 + }, + { + "epoch": 0.9301797540208137, + "grad_norm": 2.1951191425323486, + "learning_rate": 1.5845092962676306e-05, + "loss": 1.1517, + "step": 1229 + }, + { + "epoch": 0.9309366130558183, + "grad_norm": 2.299997091293335, + "learning_rate": 1.5838577186190064e-05, + "loss": 1.2327, + "step": 1230 + }, + { + "epoch": 0.9316934720908231, + "grad_norm": 2.69441556930542, + "learning_rate": 1.5832057646890594e-05, + "loss": 1.1622, + "step": 1231 + }, + { + "epoch": 0.9324503311258279, + "grad_norm": 2.3268439769744873, + "learning_rate": 1.582553434898029e-05, + "loss": 1.2181, + "step": 1232 + }, + { + "epoch": 0.9332071901608325, + "grad_norm": 2.1350252628326416, + "learning_rate": 1.5819007296663974e-05, + "loss": 1.1818, + "step": 1233 + }, + { + "epoch": 0.9339640491958373, + "grad_norm": 2.0754928588867188, + "learning_rate": 1.5812476494148876e-05, + "loss": 1.1847, + "step": 1234 + }, + { + "epoch": 0.934720908230842, + "grad_norm": 2.48238205909729, + "learning_rate": 1.5805941945644658e-05, + "loss": 1.222, + "step": 1235 + }, + { + "epoch": 0.9354777672658467, + "grad_norm": 2.202993154525757, + "learning_rate": 1.579940365536339e-05, + "loss": 1.2381, + "step": 1236 + }, + { + "epoch": 0.9362346263008514, + "grad_norm": 2.424055337905884, + "learning_rate": 1.5792861627519554e-05, + "loss": 1.2035, + "step": 1237 + }, + { + "epoch": 0.9369914853358562, + "grad_norm": 2.270042896270752, + "learning_rate": 1.578631586633004e-05, + "loss": 1.2268, + "step": 1238 + }, + { + "epoch": 0.937748344370861, + "grad_norm": 2.38864803314209, + "learning_rate": 1.5779766376014146e-05, + "loss": 1.2202, + "step": 1239 + }, + { + "epoch": 0.9385052034058656, + "grad_norm": 2.137854814529419, + "learning_rate": 1.5773213160793574e-05, + "loss": 1.2246, + "step": 1240 + }, + { + "epoch": 0.9392620624408704, + "grad_norm": 2.5035834312438965, + "learning_rate": 1.5766656224892424e-05, + "loss": 1.2246, + "step": 1241 + }, + { + "epoch": 0.9400189214758751, + "grad_norm": 2.498552083969116, + "learning_rate": 1.5760095572537207e-05, + "loss": 1.2037, + "step": 1242 + }, + { + "epoch": 0.9407757805108798, + "grad_norm": 2.0278542041778564, + "learning_rate": 1.5753531207956806e-05, + "loss": 1.2197, + "step": 1243 + }, + { + "epoch": 0.9415326395458846, + "grad_norm": 2.1201868057250977, + "learning_rate": 1.5746963135382522e-05, + "loss": 1.1557, + "step": 1244 + }, + { + "epoch": 0.9422894985808893, + "grad_norm": 2.480867385864258, + "learning_rate": 1.574039135904802e-05, + "loss": 1.2006, + "step": 1245 + }, + { + "epoch": 0.9430463576158941, + "grad_norm": 2.257807970046997, + "learning_rate": 1.573381588318938e-05, + "loss": 1.2235, + "step": 1246 + }, + { + "epoch": 0.9438032166508987, + "grad_norm": 2.2047722339630127, + "learning_rate": 1.5727236712045053e-05, + "loss": 1.1904, + "step": 1247 + }, + { + "epoch": 0.9445600756859035, + "grad_norm": 2.2862167358398438, + "learning_rate": 1.5720653849855862e-05, + "loss": 1.2388, + "step": 1248 + }, + { + "epoch": 0.9453169347209083, + "grad_norm": 2.2071452140808105, + "learning_rate": 1.571406730086503e-05, + "loss": 1.1967, + "step": 1249 + }, + { + "epoch": 0.9460737937559129, + "grad_norm": 2.1805355548858643, + "learning_rate": 1.5707477069318143e-05, + "loss": 1.1927, + "step": 1250 + }, + { + "epoch": 0.9468306527909177, + "grad_norm": 2.156611204147339, + "learning_rate": 1.5700883159463162e-05, + "loss": 1.2216, + "step": 1251 + }, + { + "epoch": 0.9475875118259224, + "grad_norm": 2.2290961742401123, + "learning_rate": 1.5694285575550416e-05, + "loss": 1.2116, + "step": 1252 + }, + { + "epoch": 0.9483443708609272, + "grad_norm": 2.0691416263580322, + "learning_rate": 1.568768432183262e-05, + "loss": 1.2077, + "step": 1253 + }, + { + "epoch": 0.9491012298959319, + "grad_norm": 2.2860946655273438, + "learning_rate": 1.568107940256483e-05, + "loss": 1.2392, + "step": 1254 + }, + { + "epoch": 0.9498580889309366, + "grad_norm": 2.3357367515563965, + "learning_rate": 1.567447082200448e-05, + "loss": 1.2469, + "step": 1255 + }, + { + "epoch": 0.9506149479659414, + "grad_norm": 2.224269151687622, + "learning_rate": 1.566785858441136e-05, + "loss": 1.2553, + "step": 1256 + }, + { + "epoch": 0.951371807000946, + "grad_norm": 2.274747133255005, + "learning_rate": 1.566124269404762e-05, + "loss": 1.1486, + "step": 1257 + }, + { + "epoch": 0.9521286660359508, + "grad_norm": 2.205291271209717, + "learning_rate": 1.5654623155177758e-05, + "loss": 1.228, + "step": 1258 + }, + { + "epoch": 0.9528855250709556, + "grad_norm": 2.059138774871826, + "learning_rate": 1.564799997206863e-05, + "loss": 1.152, + "step": 1259 + }, + { + "epoch": 0.9536423841059603, + "grad_norm": 2.382854700088501, + "learning_rate": 1.564137314898944e-05, + "loss": 1.2262, + "step": 1260 + }, + { + "epoch": 0.954399243140965, + "grad_norm": 2.1926519870758057, + "learning_rate": 1.563474269021174e-05, + "loss": 1.1439, + "step": 1261 + }, + { + "epoch": 0.9551561021759697, + "grad_norm": 2.247835159301758, + "learning_rate": 1.5628108600009414e-05, + "loss": 1.2191, + "step": 1262 + }, + { + "epoch": 0.9559129612109745, + "grad_norm": 2.6202445030212402, + "learning_rate": 1.5621470882658696e-05, + "loss": 1.1955, + "step": 1263 + }, + { + "epoch": 0.9566698202459792, + "grad_norm": 2.1109254360198975, + "learning_rate": 1.5614829542438162e-05, + "loss": 1.2208, + "step": 1264 + }, + { + "epoch": 0.9574266792809839, + "grad_norm": 2.0421035289764404, + "learning_rate": 1.5608184583628723e-05, + "loss": 1.2597, + "step": 1265 + }, + { + "epoch": 0.9581835383159887, + "grad_norm": 2.3527796268463135, + "learning_rate": 1.5601536010513608e-05, + "loss": 1.2437, + "step": 1266 + }, + { + "epoch": 0.9589403973509933, + "grad_norm": 2.39426851272583, + "learning_rate": 1.559488382737839e-05, + "loss": 1.2418, + "step": 1267 + }, + { + "epoch": 0.9596972563859981, + "grad_norm": 2.546283483505249, + "learning_rate": 1.558822803851097e-05, + "loss": 1.2295, + "step": 1268 + }, + { + "epoch": 0.9604541154210028, + "grad_norm": 2.275153160095215, + "learning_rate": 1.558156864820156e-05, + "loss": 1.1896, + "step": 1269 + }, + { + "epoch": 0.9612109744560076, + "grad_norm": 2.1879630088806152, + "learning_rate": 1.5574905660742707e-05, + "loss": 1.1766, + "step": 1270 + }, + { + "epoch": 0.9619678334910123, + "grad_norm": 2.3438518047332764, + "learning_rate": 1.556823908042927e-05, + "loss": 1.1828, + "step": 1271 + }, + { + "epoch": 0.962724692526017, + "grad_norm": 2.673069477081299, + "learning_rate": 1.5561568911558422e-05, + "loss": 1.1909, + "step": 1272 + }, + { + "epoch": 0.9634815515610218, + "grad_norm": 2.3552541732788086, + "learning_rate": 1.5554895158429654e-05, + "loss": 1.2246, + "step": 1273 + }, + { + "epoch": 0.9642384105960264, + "grad_norm": 2.1169943809509277, + "learning_rate": 1.5548217825344765e-05, + "loss": 1.2018, + "step": 1274 + }, + { + "epoch": 0.9649952696310312, + "grad_norm": 2.1841084957122803, + "learning_rate": 1.5541536916607863e-05, + "loss": 1.1496, + "step": 1275 + }, + { + "epoch": 0.965752128666036, + "grad_norm": 2.1429550647735596, + "learning_rate": 1.553485243652536e-05, + "loss": 1.1537, + "step": 1276 + }, + { + "epoch": 0.9665089877010407, + "grad_norm": 2.6029670238494873, + "learning_rate": 1.5528164389405972e-05, + "loss": 1.2348, + "step": 1277 + }, + { + "epoch": 0.9672658467360454, + "grad_norm": 2.105222463607788, + "learning_rate": 1.5521472779560705e-05, + "loss": 1.217, + "step": 1278 + }, + { + "epoch": 0.9680227057710501, + "grad_norm": 2.1541764736175537, + "learning_rate": 1.5514777611302875e-05, + "loss": 1.1844, + "step": 1279 + }, + { + "epoch": 0.9687795648060549, + "grad_norm": 2.0249156951904297, + "learning_rate": 1.5508078888948086e-05, + "loss": 1.2191, + "step": 1280 + }, + { + "epoch": 0.9695364238410596, + "grad_norm": 2.187110185623169, + "learning_rate": 1.550137661681423e-05, + "loss": 1.1811, + "step": 1281 + }, + { + "epoch": 0.9702932828760643, + "grad_norm": 2.27626371383667, + "learning_rate": 1.5494670799221485e-05, + "loss": 1.186, + "step": 1282 + }, + { + "epoch": 0.9710501419110691, + "grad_norm": 2.0355005264282227, + "learning_rate": 1.5487961440492327e-05, + "loss": 1.2338, + "step": 1283 + }, + { + "epoch": 0.9718070009460738, + "grad_norm": 2.126351833343506, + "learning_rate": 1.54812485449515e-05, + "loss": 1.2129, + "step": 1284 + }, + { + "epoch": 0.9725638599810785, + "grad_norm": 2.150451421737671, + "learning_rate": 1.5474532116926037e-05, + "loss": 1.1812, + "step": 1285 + }, + { + "epoch": 0.9733207190160833, + "grad_norm": 2.0796091556549072, + "learning_rate": 1.5467812160745245e-05, + "loss": 1.2273, + "step": 1286 + }, + { + "epoch": 0.974077578051088, + "grad_norm": 2.349214792251587, + "learning_rate": 1.5461088680740702e-05, + "loss": 1.2286, + "step": 1287 + }, + { + "epoch": 0.9748344370860927, + "grad_norm": 2.1848902702331543, + "learning_rate": 1.545436168124627e-05, + "loss": 1.2239, + "step": 1288 + }, + { + "epoch": 0.9755912961210974, + "grad_norm": 2.261702299118042, + "learning_rate": 1.544763116659806e-05, + "loss": 1.202, + "step": 1289 + }, + { + "epoch": 0.9763481551561022, + "grad_norm": 2.2427971363067627, + "learning_rate": 1.5440897141134464e-05, + "loss": 1.2133, + "step": 1290 + }, + { + "epoch": 0.977105014191107, + "grad_norm": 2.076875686645508, + "learning_rate": 1.5434159609196128e-05, + "loss": 1.2056, + "step": 1291 + }, + { + "epoch": 0.9778618732261116, + "grad_norm": 2.26599383354187, + "learning_rate": 1.542741857512597e-05, + "loss": 1.195, + "step": 1292 + }, + { + "epoch": 0.9786187322611164, + "grad_norm": 2.262747049331665, + "learning_rate": 1.5420674043269152e-05, + "loss": 1.2286, + "step": 1293 + }, + { + "epoch": 0.9793755912961211, + "grad_norm": 2.1384646892547607, + "learning_rate": 1.5413926017973097e-05, + "loss": 1.1843, + "step": 1294 + }, + { + "epoch": 0.9801324503311258, + "grad_norm": 2.3019633293151855, + "learning_rate": 1.540717450358748e-05, + "loss": 1.2474, + "step": 1295 + }, + { + "epoch": 0.9808893093661306, + "grad_norm": 2.6212801933288574, + "learning_rate": 1.5400419504464222e-05, + "loss": 1.2305, + "step": 1296 + }, + { + "epoch": 0.9816461684011353, + "grad_norm": 2.415092706680298, + "learning_rate": 1.5393661024957495e-05, + "loss": 1.2394, + "step": 1297 + }, + { + "epoch": 0.98240302743614, + "grad_norm": 2.392845392227173, + "learning_rate": 1.5386899069423712e-05, + "loss": 1.1971, + "step": 1298 + }, + { + "epoch": 0.9831598864711447, + "grad_norm": 2.291163206100464, + "learning_rate": 1.5380133642221525e-05, + "loss": 1.2004, + "step": 1299 + }, + { + "epoch": 0.9839167455061495, + "grad_norm": 2.191312313079834, + "learning_rate": 1.5373364747711825e-05, + "loss": 1.1677, + "step": 1300 + }, + { + "epoch": 0.9846736045411542, + "grad_norm": 2.2666783332824707, + "learning_rate": 1.536659239025774e-05, + "loss": 1.1656, + "step": 1301 + }, + { + "epoch": 0.9854304635761589, + "grad_norm": 2.578672409057617, + "learning_rate": 1.5359816574224626e-05, + "loss": 1.2021, + "step": 1302 + }, + { + "epoch": 0.9861873226111637, + "grad_norm": 2.1345741748809814, + "learning_rate": 1.5353037303980075e-05, + "loss": 1.2277, + "step": 1303 + }, + { + "epoch": 0.9869441816461684, + "grad_norm": 2.1685898303985596, + "learning_rate": 1.5346254583893895e-05, + "loss": 1.206, + "step": 1304 + }, + { + "epoch": 0.9877010406811731, + "grad_norm": 2.3150031566619873, + "learning_rate": 1.533946841833813e-05, + "loss": 1.1747, + "step": 1305 + }, + { + "epoch": 0.9884578997161778, + "grad_norm": 2.3677496910095215, + "learning_rate": 1.5332678811687034e-05, + "loss": 1.2502, + "step": 1306 + }, + { + "epoch": 0.9892147587511826, + "grad_norm": 2.0479371547698975, + "learning_rate": 1.5325885768317085e-05, + "loss": 1.129, + "step": 1307 + }, + { + "epoch": 0.9899716177861874, + "grad_norm": 2.272096633911133, + "learning_rate": 1.531908929260698e-05, + "loss": 1.1515, + "step": 1308 + }, + { + "epoch": 0.990728476821192, + "grad_norm": 2.233167886734009, + "learning_rate": 1.5312289388937613e-05, + "loss": 1.1576, + "step": 1309 + }, + { + "epoch": 0.9914853358561968, + "grad_norm": 2.329028606414795, + "learning_rate": 1.530548606169211e-05, + "loss": 1.2331, + "step": 1310 + }, + { + "epoch": 0.9922421948912015, + "grad_norm": 2.3802735805511475, + "learning_rate": 1.5298679315255786e-05, + "loss": 1.1576, + "step": 1311 + }, + { + "epoch": 0.9929990539262062, + "grad_norm": 2.45041561126709, + "learning_rate": 1.5291869154016167e-05, + "loss": 1.2241, + "step": 1312 + }, + { + "epoch": 0.993755912961211, + "grad_norm": 2.528601884841919, + "learning_rate": 1.5285055582362975e-05, + "loss": 1.2257, + "step": 1313 + }, + { + "epoch": 0.9945127719962157, + "grad_norm": 2.2096829414367676, + "learning_rate": 1.5278238604688143e-05, + "loss": 1.1959, + "step": 1314 + }, + { + "epoch": 0.9952696310312205, + "grad_norm": 2.218921184539795, + "learning_rate": 1.5271418225385784e-05, + "loss": 1.1444, + "step": 1315 + }, + { + "epoch": 0.9960264900662251, + "grad_norm": 2.563999891281128, + "learning_rate": 1.526459444885221e-05, + "loss": 1.192, + "step": 1316 + }, + { + "epoch": 0.9967833491012299, + "grad_norm": 2.6427245140075684, + "learning_rate": 1.5257767279485934e-05, + "loss": 1.1575, + "step": 1317 + }, + { + "epoch": 0.9975402081362347, + "grad_norm": 2.0441269874572754, + "learning_rate": 1.5250936721687628e-05, + "loss": 1.1636, + "step": 1318 + }, + { + "epoch": 0.9982970671712393, + "grad_norm": 2.560488700866699, + "learning_rate": 1.5244102779860178e-05, + "loss": 1.2198, + "step": 1319 + }, + { + "epoch": 0.9990539262062441, + "grad_norm": 2.5017917156219482, + "learning_rate": 1.5237265458408637e-05, + "loss": 1.2215, + "step": 1320 + }, + { + "epoch": 0.9998107852412488, + "grad_norm": 2.3458917140960693, + "learning_rate": 1.5230424761740234e-05, + "loss": 1.1645, + "step": 1321 + }, + { + "epoch": 1.0005676442762534, + "grad_norm": 2.2212741374969482, + "learning_rate": 1.5223580694264382e-05, + "loss": 1.2054, + "step": 1322 + }, + { + "epoch": 1.0013245033112583, + "grad_norm": 2.394789695739746, + "learning_rate": 1.5216733260392658e-05, + "loss": 1.1085, + "step": 1323 + }, + { + "epoch": 1.002081362346263, + "grad_norm": 2.135875701904297, + "learning_rate": 1.5209882464538817e-05, + "loss": 1.1754, + "step": 1324 + }, + { + "epoch": 1.0028382213812677, + "grad_norm": 2.4602649211883545, + "learning_rate": 1.5203028311118778e-05, + "loss": 1.1202, + "step": 1325 + }, + { + "epoch": 1.0035950804162725, + "grad_norm": 2.5135326385498047, + "learning_rate": 1.5196170804550618e-05, + "loss": 1.147, + "step": 1326 + }, + { + "epoch": 1.0043519394512772, + "grad_norm": 2.3376166820526123, + "learning_rate": 1.518930994925459e-05, + "loss": 1.1715, + "step": 1327 + }, + { + "epoch": 1.0051087984862819, + "grad_norm": 2.2913684844970703, + "learning_rate": 1.518244574965309e-05, + "loss": 1.2037, + "step": 1328 + }, + { + "epoch": 1.0058656575212868, + "grad_norm": 2.34970760345459, + "learning_rate": 1.5175578210170678e-05, + "loss": 1.155, + "step": 1329 + }, + { + "epoch": 1.0066225165562914, + "grad_norm": 2.345945119857788, + "learning_rate": 1.5168707335234067e-05, + "loss": 1.1392, + "step": 1330 + }, + { + "epoch": 1.007379375591296, + "grad_norm": 2.2599105834960938, + "learning_rate": 1.5161833129272117e-05, + "loss": 1.1402, + "step": 1331 + }, + { + "epoch": 1.0081362346263008, + "grad_norm": 2.5355637073516846, + "learning_rate": 1.5154955596715836e-05, + "loss": 1.2212, + "step": 1332 + }, + { + "epoch": 1.0088930936613056, + "grad_norm": 2.4323315620422363, + "learning_rate": 1.5148074741998377e-05, + "loss": 1.1289, + "step": 1333 + }, + { + "epoch": 1.0096499526963103, + "grad_norm": 2.232952833175659, + "learning_rate": 1.5141190569555033e-05, + "loss": 1.1044, + "step": 1334 + }, + { + "epoch": 1.010406811731315, + "grad_norm": 2.1613996028900146, + "learning_rate": 1.513430308382324e-05, + "loss": 1.1678, + "step": 1335 + }, + { + "epoch": 1.0111636707663199, + "grad_norm": 2.369002342224121, + "learning_rate": 1.5127412289242562e-05, + "loss": 1.2219, + "step": 1336 + }, + { + "epoch": 1.0119205298013245, + "grad_norm": 2.2522876262664795, + "learning_rate": 1.51205181902547e-05, + "loss": 1.1728, + "step": 1337 + }, + { + "epoch": 1.0126773888363292, + "grad_norm": 2.172529935836792, + "learning_rate": 1.5113620791303489e-05, + "loss": 1.1599, + "step": 1338 + }, + { + "epoch": 1.013434247871334, + "grad_norm": 2.265456199645996, + "learning_rate": 1.5106720096834885e-05, + "loss": 1.1496, + "step": 1339 + }, + { + "epoch": 1.0141911069063387, + "grad_norm": 2.3640429973602295, + "learning_rate": 1.5099816111296968e-05, + "loss": 1.2001, + "step": 1340 + }, + { + "epoch": 1.0149479659413434, + "grad_norm": 2.227107286453247, + "learning_rate": 1.5092908839139948e-05, + "loss": 1.1911, + "step": 1341 + }, + { + "epoch": 1.015704824976348, + "grad_norm": 2.3177998065948486, + "learning_rate": 1.5085998284816144e-05, + "loss": 1.1233, + "step": 1342 + }, + { + "epoch": 1.016461684011353, + "grad_norm": 2.167343854904175, + "learning_rate": 1.507908445277999e-05, + "loss": 1.2057, + "step": 1343 + }, + { + "epoch": 1.0172185430463576, + "grad_norm": 2.2151575088500977, + "learning_rate": 1.5072167347488042e-05, + "loss": 1.1828, + "step": 1344 + }, + { + "epoch": 1.0179754020813623, + "grad_norm": 2.031900405883789, + "learning_rate": 1.5065246973398959e-05, + "loss": 1.1408, + "step": 1345 + }, + { + "epoch": 1.0187322611163672, + "grad_norm": 2.3186428546905518, + "learning_rate": 1.5058323334973508e-05, + "loss": 1.1698, + "step": 1346 + }, + { + "epoch": 1.0194891201513718, + "grad_norm": 2.2243926525115967, + "learning_rate": 1.5051396436674562e-05, + "loss": 1.1983, + "step": 1347 + }, + { + "epoch": 1.0202459791863765, + "grad_norm": 2.0334129333496094, + "learning_rate": 1.5044466282967092e-05, + "loss": 1.1257, + "step": 1348 + }, + { + "epoch": 1.0210028382213812, + "grad_norm": 2.194042921066284, + "learning_rate": 1.503753287831817e-05, + "loss": 1.224, + "step": 1349 + }, + { + "epoch": 1.021759697256386, + "grad_norm": 2.2667534351348877, + "learning_rate": 1.5030596227196963e-05, + "loss": 1.2042, + "step": 1350 + }, + { + "epoch": 1.0225165562913907, + "grad_norm": 2.2039318084716797, + "learning_rate": 1.5023656334074732e-05, + "loss": 1.1965, + "step": 1351 + }, + { + "epoch": 1.0232734153263954, + "grad_norm": 2.3508946895599365, + "learning_rate": 1.5016713203424824e-05, + "loss": 1.1718, + "step": 1352 + }, + { + "epoch": 1.0240302743614003, + "grad_norm": 2.135310649871826, + "learning_rate": 1.5009766839722679e-05, + "loss": 1.1503, + "step": 1353 + }, + { + "epoch": 1.024787133396405, + "grad_norm": 2.2958900928497314, + "learning_rate": 1.5002817247445813e-05, + "loss": 1.2141, + "step": 1354 + }, + { + "epoch": 1.0255439924314096, + "grad_norm": 2.3174233436584473, + "learning_rate": 1.4995864431073828e-05, + "loss": 1.158, + "step": 1355 + }, + { + "epoch": 1.0263008514664145, + "grad_norm": 2.1523966789245605, + "learning_rate": 1.4988908395088405e-05, + "loss": 1.1757, + "step": 1356 + }, + { + "epoch": 1.0270577105014191, + "grad_norm": 2.2384963035583496, + "learning_rate": 1.4981949143973297e-05, + "loss": 1.1391, + "step": 1357 + }, + { + "epoch": 1.0278145695364238, + "grad_norm": 2.1168923377990723, + "learning_rate": 1.4974986682214332e-05, + "loss": 1.1306, + "step": 1358 + }, + { + "epoch": 1.0285714285714285, + "grad_norm": 2.392561435699463, + "learning_rate": 1.4968021014299409e-05, + "loss": 1.2224, + "step": 1359 + }, + { + "epoch": 1.0293282876064334, + "grad_norm": 2.2257487773895264, + "learning_rate": 1.4961052144718486e-05, + "loss": 1.1284, + "step": 1360 + }, + { + "epoch": 1.030085146641438, + "grad_norm": 2.0494847297668457, + "learning_rate": 1.4954080077963596e-05, + "loss": 1.1204, + "step": 1361 + }, + { + "epoch": 1.0308420056764427, + "grad_norm": 2.2043280601501465, + "learning_rate": 1.4947104818528822e-05, + "loss": 1.2135, + "step": 1362 + }, + { + "epoch": 1.0315988647114476, + "grad_norm": 2.1744041442871094, + "learning_rate": 1.494012637091031e-05, + "loss": 1.1533, + "step": 1363 + }, + { + "epoch": 1.0323557237464522, + "grad_norm": 2.1696369647979736, + "learning_rate": 1.4933144739606262e-05, + "loss": 1.171, + "step": 1364 + }, + { + "epoch": 1.033112582781457, + "grad_norm": 2.259871006011963, + "learning_rate": 1.4926159929116934e-05, + "loss": 1.1689, + "step": 1365 + }, + { + "epoch": 1.0338694418164618, + "grad_norm": 2.3883163928985596, + "learning_rate": 1.4919171943944628e-05, + "loss": 1.1808, + "step": 1366 + }, + { + "epoch": 1.0346263008514665, + "grad_norm": 2.4137635231018066, + "learning_rate": 1.4912180788593686e-05, + "loss": 1.2425, + "step": 1367 + }, + { + "epoch": 1.0353831598864711, + "grad_norm": 2.2681548595428467, + "learning_rate": 1.4905186467570509e-05, + "loss": 1.1808, + "step": 1368 + }, + { + "epoch": 1.0361400189214758, + "grad_norm": 2.2818410396575928, + "learning_rate": 1.4898188985383522e-05, + "loss": 1.1897, + "step": 1369 + }, + { + "epoch": 1.0368968779564807, + "grad_norm": 2.2151215076446533, + "learning_rate": 1.4891188346543201e-05, + "loss": 1.136, + "step": 1370 + }, + { + "epoch": 1.0376537369914853, + "grad_norm": 2.248666763305664, + "learning_rate": 1.488418455556205e-05, + "loss": 1.1758, + "step": 1371 + }, + { + "epoch": 1.03841059602649, + "grad_norm": 2.2258033752441406, + "learning_rate": 1.4877177616954602e-05, + "loss": 1.1628, + "step": 1372 + }, + { + "epoch": 1.0391674550614949, + "grad_norm": 2.117659091949463, + "learning_rate": 1.4870167535237428e-05, + "loss": 1.2191, + "step": 1373 + }, + { + "epoch": 1.0399243140964995, + "grad_norm": 2.367983102798462, + "learning_rate": 1.4863154314929114e-05, + "loss": 1.1415, + "step": 1374 + }, + { + "epoch": 1.0406811731315042, + "grad_norm": 2.134035587310791, + "learning_rate": 1.4856137960550278e-05, + "loss": 1.2032, + "step": 1375 + }, + { + "epoch": 1.0414380321665089, + "grad_norm": 2.392430543899536, + "learning_rate": 1.4849118476623556e-05, + "loss": 1.197, + "step": 1376 + }, + { + "epoch": 1.0421948912015138, + "grad_norm": 2.237036943435669, + "learning_rate": 1.4842095867673603e-05, + "loss": 1.168, + "step": 1377 + }, + { + "epoch": 1.0429517502365184, + "grad_norm": 2.338472604751587, + "learning_rate": 1.4835070138227077e-05, + "loss": 1.1629, + "step": 1378 + }, + { + "epoch": 1.043708609271523, + "grad_norm": 2.1505656242370605, + "learning_rate": 1.4828041292812662e-05, + "loss": 1.1559, + "step": 1379 + }, + { + "epoch": 1.044465468306528, + "grad_norm": 2.155229330062866, + "learning_rate": 1.4821009335961045e-05, + "loss": 1.1477, + "step": 1380 + }, + { + "epoch": 1.0452223273415326, + "grad_norm": 2.1831212043762207, + "learning_rate": 1.4813974272204918e-05, + "loss": 1.1486, + "step": 1381 + }, + { + "epoch": 1.0459791863765373, + "grad_norm": 2.2904438972473145, + "learning_rate": 1.4806936106078971e-05, + "loss": 1.1605, + "step": 1382 + }, + { + "epoch": 1.0467360454115422, + "grad_norm": 2.416222333908081, + "learning_rate": 1.4799894842119906e-05, + "loss": 1.1161, + "step": 1383 + }, + { + "epoch": 1.0474929044465469, + "grad_norm": 2.2631683349609375, + "learning_rate": 1.4792850484866408e-05, + "loss": 1.173, + "step": 1384 + }, + { + "epoch": 1.0482497634815515, + "grad_norm": 2.2983131408691406, + "learning_rate": 1.4785803038859166e-05, + "loss": 1.1584, + "step": 1385 + }, + { + "epoch": 1.0490066225165562, + "grad_norm": 2.1680402755737305, + "learning_rate": 1.4778752508640852e-05, + "loss": 1.1689, + "step": 1386 + }, + { + "epoch": 1.049763481551561, + "grad_norm": 2.161684036254883, + "learning_rate": 1.4771698898756137e-05, + "loss": 1.1772, + "step": 1387 + }, + { + "epoch": 1.0505203405865657, + "grad_norm": 2.048295021057129, + "learning_rate": 1.4764642213751664e-05, + "loss": 1.1598, + "step": 1388 + }, + { + "epoch": 1.0512771996215704, + "grad_norm": 2.0943684577941895, + "learning_rate": 1.4757582458176067e-05, + "loss": 1.1389, + "step": 1389 + }, + { + "epoch": 1.0520340586565753, + "grad_norm": 2.5327534675598145, + "learning_rate": 1.475051963657996e-05, + "loss": 1.1496, + "step": 1390 + }, + { + "epoch": 1.05279091769158, + "grad_norm": 2.1597166061401367, + "learning_rate": 1.4743453753515924e-05, + "loss": 1.1409, + "step": 1391 + }, + { + "epoch": 1.0535477767265846, + "grad_norm": 2.877094268798828, + "learning_rate": 1.4736384813538527e-05, + "loss": 1.1359, + "step": 1392 + }, + { + "epoch": 1.0543046357615895, + "grad_norm": 2.4092910289764404, + "learning_rate": 1.472931282120429e-05, + "loss": 1.1673, + "step": 1393 + }, + { + "epoch": 1.0550614947965942, + "grad_norm": 2.26458740234375, + "learning_rate": 1.4722237781071717e-05, + "loss": 1.167, + "step": 1394 + }, + { + "epoch": 1.0558183538315988, + "grad_norm": 2.2418289184570312, + "learning_rate": 1.4715159697701276e-05, + "loss": 1.1674, + "step": 1395 + }, + { + "epoch": 1.0565752128666035, + "grad_norm": 2.948460102081299, + "learning_rate": 1.470807857565538e-05, + "loss": 1.1459, + "step": 1396 + }, + { + "epoch": 1.0573320719016084, + "grad_norm": 2.1875085830688477, + "learning_rate": 1.4700994419498423e-05, + "loss": 1.1781, + "step": 1397 + }, + { + "epoch": 1.058088930936613, + "grad_norm": 2.3909361362457275, + "learning_rate": 1.4693907233796737e-05, + "loss": 1.1346, + "step": 1398 + }, + { + "epoch": 1.0588457899716177, + "grad_norm": 2.12752628326416, + "learning_rate": 1.4686817023118619e-05, + "loss": 1.1127, + "step": 1399 + }, + { + "epoch": 1.0596026490066226, + "grad_norm": 2.3758580684661865, + "learning_rate": 1.4679723792034304e-05, + "loss": 1.1667, + "step": 1400 + }, + { + "epoch": 1.0603595080416273, + "grad_norm": 2.23144793510437, + "learning_rate": 1.4672627545115991e-05, + "loss": 1.1693, + "step": 1401 + }, + { + "epoch": 1.061116367076632, + "grad_norm": 2.2588181495666504, + "learning_rate": 1.46655282869378e-05, + "loss": 1.1555, + "step": 1402 + }, + { + "epoch": 1.0618732261116368, + "grad_norm": 3.266263961791992, + "learning_rate": 1.4658426022075816e-05, + "loss": 1.1475, + "step": 1403 + }, + { + "epoch": 1.0626300851466415, + "grad_norm": 2.344022750854492, + "learning_rate": 1.4651320755108042e-05, + "loss": 1.1151, + "step": 1404 + }, + { + "epoch": 1.0633869441816461, + "grad_norm": 2.701164722442627, + "learning_rate": 1.464421249061443e-05, + "loss": 1.1547, + "step": 1405 + }, + { + "epoch": 1.0641438032166508, + "grad_norm": 2.4534714221954346, + "learning_rate": 1.4637101233176856e-05, + "loss": 1.17, + "step": 1406 + }, + { + "epoch": 1.0649006622516557, + "grad_norm": 2.413388252258301, + "learning_rate": 1.462998698737913e-05, + "loss": 1.1852, + "step": 1407 + }, + { + "epoch": 1.0656575212866604, + "grad_norm": 2.2986247539520264, + "learning_rate": 1.4622869757806983e-05, + "loss": 1.1544, + "step": 1408 + }, + { + "epoch": 1.066414380321665, + "grad_norm": 2.308239221572876, + "learning_rate": 1.4615749549048076e-05, + "loss": 1.1572, + "step": 1409 + }, + { + "epoch": 1.06717123935667, + "grad_norm": 2.270495653152466, + "learning_rate": 1.4608626365691986e-05, + "loss": 1.1625, + "step": 1410 + }, + { + "epoch": 1.0679280983916746, + "grad_norm": 2.0776920318603516, + "learning_rate": 1.4601500212330213e-05, + "loss": 1.1879, + "step": 1411 + }, + { + "epoch": 1.0686849574266792, + "grad_norm": 2.279533863067627, + "learning_rate": 1.4594371093556159e-05, + "loss": 1.1844, + "step": 1412 + }, + { + "epoch": 1.069441816461684, + "grad_norm": 2.263552188873291, + "learning_rate": 1.4587239013965149e-05, + "loss": 1.1192, + "step": 1413 + }, + { + "epoch": 1.0701986754966888, + "grad_norm": 2.1875579357147217, + "learning_rate": 1.4580103978154414e-05, + "loss": 1.1921, + "step": 1414 + }, + { + "epoch": 1.0709555345316935, + "grad_norm": 2.553298234939575, + "learning_rate": 1.4572965990723083e-05, + "loss": 1.1307, + "step": 1415 + }, + { + "epoch": 1.0717123935666981, + "grad_norm": 2.0610175132751465, + "learning_rate": 1.4565825056272199e-05, + "loss": 1.2057, + "step": 1416 + }, + { + "epoch": 1.072469252601703, + "grad_norm": 3.3085532188415527, + "learning_rate": 1.4558681179404704e-05, + "loss": 1.1566, + "step": 1417 + }, + { + "epoch": 1.0732261116367077, + "grad_norm": 2.1448001861572266, + "learning_rate": 1.4551534364725422e-05, + "loss": 1.1958, + "step": 1418 + }, + { + "epoch": 1.0739829706717123, + "grad_norm": 2.5602312088012695, + "learning_rate": 1.4544384616841084e-05, + "loss": 1.1513, + "step": 1419 + }, + { + "epoch": 1.0747398297067172, + "grad_norm": 2.34245228767395, + "learning_rate": 1.4537231940360315e-05, + "loss": 1.1331, + "step": 1420 + }, + { + "epoch": 1.0754966887417219, + "grad_norm": 2.5702669620513916, + "learning_rate": 1.4530076339893615e-05, + "loss": 1.1369, + "step": 1421 + }, + { + "epoch": 1.0762535477767265, + "grad_norm": 2.3624837398529053, + "learning_rate": 1.4522917820053375e-05, + "loss": 1.1618, + "step": 1422 + }, + { + "epoch": 1.0770104068117312, + "grad_norm": 2.3341963291168213, + "learning_rate": 1.4515756385453868e-05, + "loss": 1.1688, + "step": 1423 + }, + { + "epoch": 1.077767265846736, + "grad_norm": 2.32336688041687, + "learning_rate": 1.4508592040711246e-05, + "loss": 1.136, + "step": 1424 + }, + { + "epoch": 1.0785241248817408, + "grad_norm": 2.4956133365631104, + "learning_rate": 1.4501424790443544e-05, + "loss": 1.1893, + "step": 1425 + }, + { + "epoch": 1.0792809839167454, + "grad_norm": 2.5766842365264893, + "learning_rate": 1.4494254639270646e-05, + "loss": 1.104, + "step": 1426 + }, + { + "epoch": 1.0800378429517503, + "grad_norm": 2.3494839668273926, + "learning_rate": 1.4487081591814336e-05, + "loss": 1.1509, + "step": 1427 + }, + { + "epoch": 1.080794701986755, + "grad_norm": 2.25639009475708, + "learning_rate": 1.4479905652698248e-05, + "loss": 1.1518, + "step": 1428 + }, + { + "epoch": 1.0815515610217596, + "grad_norm": 2.454833984375, + "learning_rate": 1.4472726826547876e-05, + "loss": 1.2164, + "step": 1429 + }, + { + "epoch": 1.0823084200567645, + "grad_norm": 2.320312976837158, + "learning_rate": 1.4465545117990587e-05, + "loss": 1.1562, + "step": 1430 + }, + { + "epoch": 1.0830652790917692, + "grad_norm": 2.136070966720581, + "learning_rate": 1.4458360531655606e-05, + "loss": 1.1621, + "step": 1431 + }, + { + "epoch": 1.0838221381267739, + "grad_norm": 2.3895716667175293, + "learning_rate": 1.4451173072173996e-05, + "loss": 1.1442, + "step": 1432 + }, + { + "epoch": 1.0845789971617785, + "grad_norm": 2.185600757598877, + "learning_rate": 1.4443982744178694e-05, + "loss": 1.1352, + "step": 1433 + }, + { + "epoch": 1.0853358561967834, + "grad_norm": 2.2408831119537354, + "learning_rate": 1.4436789552304471e-05, + "loss": 1.1771, + "step": 1434 + }, + { + "epoch": 1.086092715231788, + "grad_norm": 2.2491331100463867, + "learning_rate": 1.4429593501187952e-05, + "loss": 1.1509, + "step": 1435 + }, + { + "epoch": 1.0868495742667927, + "grad_norm": 2.4954214096069336, + "learning_rate": 1.4422394595467597e-05, + "loss": 1.1333, + "step": 1436 + }, + { + "epoch": 1.0876064333017976, + "grad_norm": 2.427107572555542, + "learning_rate": 1.4415192839783716e-05, + "loss": 1.1806, + "step": 1437 + }, + { + "epoch": 1.0883632923368023, + "grad_norm": 5.063598155975342, + "learning_rate": 1.4407988238778448e-05, + "loss": 1.1364, + "step": 1438 + }, + { + "epoch": 1.089120151371807, + "grad_norm": 2.261101245880127, + "learning_rate": 1.4400780797095769e-05, + "loss": 1.171, + "step": 1439 + }, + { + "epoch": 1.0898770104068118, + "grad_norm": 2.3400819301605225, + "learning_rate": 1.4393570519381484e-05, + "loss": 1.1354, + "step": 1440 + }, + { + "epoch": 1.0906338694418165, + "grad_norm": 2.225931167602539, + "learning_rate": 1.438635741028323e-05, + "loss": 1.1674, + "step": 1441 + }, + { + "epoch": 1.0913907284768212, + "grad_norm": 2.3904130458831787, + "learning_rate": 1.437914147445047e-05, + "loss": 1.1513, + "step": 1442 + }, + { + "epoch": 1.0921475875118258, + "grad_norm": 2.4583778381347656, + "learning_rate": 1.4371922716534483e-05, + "loss": 1.1708, + "step": 1443 + }, + { + "epoch": 1.0929044465468307, + "grad_norm": 2.270364761352539, + "learning_rate": 1.436470114118837e-05, + "loss": 1.1708, + "step": 1444 + }, + { + "epoch": 1.0936613055818354, + "grad_norm": 2.190642833709717, + "learning_rate": 1.4357476753067053e-05, + "loss": 1.123, + "step": 1445 + }, + { + "epoch": 1.09441816461684, + "grad_norm": 2.3521080017089844, + "learning_rate": 1.4350249556827256e-05, + "loss": 1.1793, + "step": 1446 + }, + { + "epoch": 1.095175023651845, + "grad_norm": 2.4334235191345215, + "learning_rate": 1.4343019557127522e-05, + "loss": 1.175, + "step": 1447 + }, + { + "epoch": 1.0959318826868496, + "grad_norm": 2.2919211387634277, + "learning_rate": 1.4335786758628199e-05, + "loss": 1.1601, + "step": 1448 + }, + { + "epoch": 1.0966887417218543, + "grad_norm": 2.603358745574951, + "learning_rate": 1.4328551165991435e-05, + "loss": 1.1966, + "step": 1449 + }, + { + "epoch": 1.097445600756859, + "grad_norm": 2.47110915184021, + "learning_rate": 1.4321312783881192e-05, + "loss": 1.1623, + "step": 1450 + }, + { + "epoch": 1.0982024597918638, + "grad_norm": 2.54114031791687, + "learning_rate": 1.431407161696321e-05, + "loss": 1.1598, + "step": 1451 + }, + { + "epoch": 1.0989593188268685, + "grad_norm": 2.1958112716674805, + "learning_rate": 1.4306827669905041e-05, + "loss": 1.1317, + "step": 1452 + }, + { + "epoch": 1.0997161778618731, + "grad_norm": 2.2989349365234375, + "learning_rate": 1.4299580947376022e-05, + "loss": 1.1036, + "step": 1453 + }, + { + "epoch": 1.100473036896878, + "grad_norm": 2.331904411315918, + "learning_rate": 1.4292331454047278e-05, + "loss": 1.1331, + "step": 1454 + }, + { + "epoch": 1.1012298959318827, + "grad_norm": 2.376122236251831, + "learning_rate": 1.4285079194591722e-05, + "loss": 1.1649, + "step": 1455 + }, + { + "epoch": 1.1019867549668874, + "grad_norm": 2.453084945678711, + "learning_rate": 1.4277824173684056e-05, + "loss": 1.1636, + "step": 1456 + }, + { + "epoch": 1.1027436140018922, + "grad_norm": 2.4421586990356445, + "learning_rate": 1.4270566396000744e-05, + "loss": 1.1323, + "step": 1457 + }, + { + "epoch": 1.103500473036897, + "grad_norm": 2.308035135269165, + "learning_rate": 1.426330586622005e-05, + "loss": 1.13, + "step": 1458 + }, + { + "epoch": 1.1042573320719016, + "grad_norm": 2.403162956237793, + "learning_rate": 1.4256042589021994e-05, + "loss": 1.181, + "step": 1459 + }, + { + "epoch": 1.1050141911069062, + "grad_norm": 2.4109246730804443, + "learning_rate": 1.4248776569088377e-05, + "loss": 1.1597, + "step": 1460 + }, + { + "epoch": 1.1057710501419111, + "grad_norm": 2.412398099899292, + "learning_rate": 1.4241507811102762e-05, + "loss": 1.118, + "step": 1461 + }, + { + "epoch": 1.1065279091769158, + "grad_norm": 2.8465075492858887, + "learning_rate": 1.4234236319750482e-05, + "loss": 1.1618, + "step": 1462 + }, + { + "epoch": 1.1072847682119205, + "grad_norm": 2.306621789932251, + "learning_rate": 1.4226962099718628e-05, + "loss": 1.2062, + "step": 1463 + }, + { + "epoch": 1.1080416272469253, + "grad_norm": 2.6074671745300293, + "learning_rate": 1.4219685155696053e-05, + "loss": 1.1676, + "step": 1464 + }, + { + "epoch": 1.10879848628193, + "grad_norm": 2.509995460510254, + "learning_rate": 1.421240549237336e-05, + "loss": 1.1771, + "step": 1465 + }, + { + "epoch": 1.1095553453169347, + "grad_norm": 2.535238742828369, + "learning_rate": 1.4205123114442916e-05, + "loss": 1.1682, + "step": 1466 + }, + { + "epoch": 1.1103122043519393, + "grad_norm": 2.4258975982666016, + "learning_rate": 1.4197838026598826e-05, + "loss": 1.0947, + "step": 1467 + }, + { + "epoch": 1.1110690633869442, + "grad_norm": 2.5997817516326904, + "learning_rate": 1.4190550233536946e-05, + "loss": 1.1471, + "step": 1468 + }, + { + "epoch": 1.1118259224219489, + "grad_norm": 2.358372449874878, + "learning_rate": 1.4183259739954877e-05, + "loss": 1.1564, + "step": 1469 + }, + { + "epoch": 1.1125827814569536, + "grad_norm": 2.323791027069092, + "learning_rate": 1.4175966550551963e-05, + "loss": 1.1936, + "step": 1470 + }, + { + "epoch": 1.1133396404919584, + "grad_norm": 2.334627151489258, + "learning_rate": 1.4168670670029277e-05, + "loss": 1.1514, + "step": 1471 + }, + { + "epoch": 1.114096499526963, + "grad_norm": 2.2344837188720703, + "learning_rate": 1.4161372103089637e-05, + "loss": 1.1378, + "step": 1472 + }, + { + "epoch": 1.1148533585619678, + "grad_norm": 2.200742483139038, + "learning_rate": 1.4154070854437587e-05, + "loss": 1.1783, + "step": 1473 + }, + { + "epoch": 1.1156102175969727, + "grad_norm": 2.2466723918914795, + "learning_rate": 1.4146766928779396e-05, + "loss": 1.1419, + "step": 1474 + }, + { + "epoch": 1.1163670766319773, + "grad_norm": 2.4173378944396973, + "learning_rate": 1.4139460330823071e-05, + "loss": 1.0991, + "step": 1475 + }, + { + "epoch": 1.117123935666982, + "grad_norm": 2.3149657249450684, + "learning_rate": 1.413215106527833e-05, + "loss": 1.1419, + "step": 1476 + }, + { + "epoch": 1.1178807947019869, + "grad_norm": 2.2564306259155273, + "learning_rate": 1.4124839136856612e-05, + "loss": 1.1693, + "step": 1477 + }, + { + "epoch": 1.1186376537369915, + "grad_norm": 2.131028652191162, + "learning_rate": 1.4117524550271077e-05, + "loss": 1.158, + "step": 1478 + }, + { + "epoch": 1.1193945127719962, + "grad_norm": 2.4710068702697754, + "learning_rate": 1.4110207310236595e-05, + "loss": 1.1934, + "step": 1479 + }, + { + "epoch": 1.1201513718070009, + "grad_norm": 2.138939380645752, + "learning_rate": 1.4102887421469747e-05, + "loss": 1.1196, + "step": 1480 + }, + { + "epoch": 1.1209082308420057, + "grad_norm": 2.542495012283325, + "learning_rate": 1.4095564888688822e-05, + "loss": 1.1693, + "step": 1481 + }, + { + "epoch": 1.1216650898770104, + "grad_norm": 2.4574832916259766, + "learning_rate": 1.4088239716613816e-05, + "loss": 1.1248, + "step": 1482 + }, + { + "epoch": 1.122421948912015, + "grad_norm": 2.133028268814087, + "learning_rate": 1.4080911909966419e-05, + "loss": 1.2361, + "step": 1483 + }, + { + "epoch": 1.12317880794702, + "grad_norm": 2.624393939971924, + "learning_rate": 1.4073581473470023e-05, + "loss": 1.1053, + "step": 1484 + }, + { + "epoch": 1.1239356669820246, + "grad_norm": 2.0480175018310547, + "learning_rate": 1.4066248411849717e-05, + "loss": 1.1364, + "step": 1485 + }, + { + "epoch": 1.1246925260170293, + "grad_norm": 2.2111339569091797, + "learning_rate": 1.4058912729832286e-05, + "loss": 1.1869, + "step": 1486 + }, + { + "epoch": 1.125449385052034, + "grad_norm": 2.4910013675689697, + "learning_rate": 1.4051574432146191e-05, + "loss": 1.16, + "step": 1487 + }, + { + "epoch": 1.1262062440870388, + "grad_norm": 2.419105052947998, + "learning_rate": 1.4044233523521587e-05, + "loss": 1.1637, + "step": 1488 + }, + { + "epoch": 1.1269631031220435, + "grad_norm": 2.4131598472595215, + "learning_rate": 1.4036890008690316e-05, + "loss": 1.1814, + "step": 1489 + }, + { + "epoch": 1.1277199621570482, + "grad_norm": 2.395854949951172, + "learning_rate": 1.4029543892385898e-05, + "loss": 1.1535, + "step": 1490 + }, + { + "epoch": 1.128476821192053, + "grad_norm": 2.0963070392608643, + "learning_rate": 1.4022195179343518e-05, + "loss": 1.1366, + "step": 1491 + }, + { + "epoch": 1.1292336802270577, + "grad_norm": 2.267829418182373, + "learning_rate": 1.4014843874300052e-05, + "loss": 1.1393, + "step": 1492 + }, + { + "epoch": 1.1299905392620624, + "grad_norm": 2.1519582271575928, + "learning_rate": 1.4007489981994038e-05, + "loss": 1.1728, + "step": 1493 + }, + { + "epoch": 1.1307473982970673, + "grad_norm": 2.194342613220215, + "learning_rate": 1.4000133507165684e-05, + "loss": 1.1586, + "step": 1494 + }, + { + "epoch": 1.131504257332072, + "grad_norm": 2.3476803302764893, + "learning_rate": 1.3992774454556855e-05, + "loss": 1.2297, + "step": 1495 + }, + { + "epoch": 1.1322611163670766, + "grad_norm": 2.1007235050201416, + "learning_rate": 1.3985412828911088e-05, + "loss": 1.1605, + "step": 1496 + }, + { + "epoch": 1.1330179754020813, + "grad_norm": 2.512786388397217, + "learning_rate": 1.397804863497358e-05, + "loss": 1.1765, + "step": 1497 + }, + { + "epoch": 1.1337748344370862, + "grad_norm": 2.1948659420013428, + "learning_rate": 1.397068187749117e-05, + "loss": 1.1912, + "step": 1498 + }, + { + "epoch": 1.1345316934720908, + "grad_norm": 2.348325729370117, + "learning_rate": 1.3963312561212359e-05, + "loss": 1.152, + "step": 1499 + }, + { + "epoch": 1.1352885525070955, + "grad_norm": 2.088045597076416, + "learning_rate": 1.3955940690887301e-05, + "loss": 1.0803, + "step": 1500 + }, + { + "epoch": 1.1360454115421004, + "grad_norm": 2.110816240310669, + "learning_rate": 1.3948566271267784e-05, + "loss": 1.1599, + "step": 1501 + }, + { + "epoch": 1.136802270577105, + "grad_norm": 2.306739330291748, + "learning_rate": 1.3941189307107255e-05, + "loss": 1.2, + "step": 1502 + }, + { + "epoch": 1.1375591296121097, + "grad_norm": 2.494978666305542, + "learning_rate": 1.3933809803160784e-05, + "loss": 1.1418, + "step": 1503 + }, + { + "epoch": 1.1383159886471144, + "grad_norm": 2.4510955810546875, + "learning_rate": 1.3926427764185093e-05, + "loss": 1.1455, + "step": 1504 + }, + { + "epoch": 1.1390728476821192, + "grad_norm": 2.415323495864868, + "learning_rate": 1.3919043194938528e-05, + "loss": 1.1361, + "step": 1505 + }, + { + "epoch": 1.139829706717124, + "grad_norm": 2.263831615447998, + "learning_rate": 1.391165610018107e-05, + "loss": 1.1087, + "step": 1506 + }, + { + "epoch": 1.1405865657521286, + "grad_norm": 2.5898752212524414, + "learning_rate": 1.3904266484674331e-05, + "loss": 1.1339, + "step": 1507 + }, + { + "epoch": 1.1413434247871335, + "grad_norm": 2.153635263442993, + "learning_rate": 1.3896874353181542e-05, + "loss": 1.1024, + "step": 1508 + }, + { + "epoch": 1.1421002838221381, + "grad_norm": 2.095327138900757, + "learning_rate": 1.3889479710467557e-05, + "loss": 1.2094, + "step": 1509 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 2.1070072650909424, + "learning_rate": 1.388208256129885e-05, + "loss": 1.143, + "step": 1510 + }, + { + "epoch": 1.1436140018921477, + "grad_norm": 2.2180447578430176, + "learning_rate": 1.3874682910443516e-05, + "loss": 1.1682, + "step": 1511 + }, + { + "epoch": 1.1443708609271523, + "grad_norm": 2.3871870040893555, + "learning_rate": 1.3867280762671246e-05, + "loss": 1.1671, + "step": 1512 + }, + { + "epoch": 1.145127719962157, + "grad_norm": 2.165802001953125, + "learning_rate": 1.3859876122753363e-05, + "loss": 1.1138, + "step": 1513 + }, + { + "epoch": 1.145884578997162, + "grad_norm": 2.162033796310425, + "learning_rate": 1.3852468995462785e-05, + "loss": 1.1719, + "step": 1514 + }, + { + "epoch": 1.1466414380321666, + "grad_norm": 2.163429021835327, + "learning_rate": 1.3845059385574023e-05, + "loss": 1.1483, + "step": 1515 + }, + { + "epoch": 1.1473982970671712, + "grad_norm": 2.177055597305298, + "learning_rate": 1.3837647297863203e-05, + "loss": 1.1372, + "step": 1516 + }, + { + "epoch": 1.1481551561021759, + "grad_norm": 2.3266491889953613, + "learning_rate": 1.383023273710805e-05, + "loss": 1.1825, + "step": 1517 + }, + { + "epoch": 1.1489120151371808, + "grad_norm": 2.4812657833099365, + "learning_rate": 1.3822815708087865e-05, + "loss": 1.1697, + "step": 1518 + }, + { + "epoch": 1.1496688741721854, + "grad_norm": 2.1462526321411133, + "learning_rate": 1.3815396215583564e-05, + "loss": 1.1203, + "step": 1519 + }, + { + "epoch": 1.15042573320719, + "grad_norm": 2.160487174987793, + "learning_rate": 1.3807974264377629e-05, + "loss": 1.1322, + "step": 1520 + }, + { + "epoch": 1.1511825922421948, + "grad_norm": 2.234320640563965, + "learning_rate": 1.3800549859254144e-05, + "loss": 1.1393, + "step": 1521 + }, + { + "epoch": 1.1519394512771997, + "grad_norm": 2.4396426677703857, + "learning_rate": 1.3793123004998765e-05, + "loss": 1.1671, + "step": 1522 + }, + { + "epoch": 1.1526963103122043, + "grad_norm": 2.3154118061065674, + "learning_rate": 1.3785693706398724e-05, + "loss": 1.1867, + "step": 1523 + }, + { + "epoch": 1.153453169347209, + "grad_norm": 2.2119319438934326, + "learning_rate": 1.377826196824284e-05, + "loss": 1.1582, + "step": 1524 + }, + { + "epoch": 1.1542100283822139, + "grad_norm": 2.2084405422210693, + "learning_rate": 1.3770827795321495e-05, + "loss": 1.1613, + "step": 1525 + }, + { + "epoch": 1.1549668874172185, + "grad_norm": 2.341912031173706, + "learning_rate": 1.3763391192426644e-05, + "loss": 1.1519, + "step": 1526 + }, + { + "epoch": 1.1557237464522232, + "grad_norm": 2.2736034393310547, + "learning_rate": 1.3755952164351814e-05, + "loss": 1.1465, + "step": 1527 + }, + { + "epoch": 1.156480605487228, + "grad_norm": 2.2359468936920166, + "learning_rate": 1.3748510715892075e-05, + "loss": 1.193, + "step": 1528 + }, + { + "epoch": 1.1572374645222328, + "grad_norm": 1.9430551528930664, + "learning_rate": 1.3741066851844082e-05, + "loss": 1.139, + "step": 1529 + }, + { + "epoch": 1.1579943235572374, + "grad_norm": 2.0962564945220947, + "learning_rate": 1.3733620577006035e-05, + "loss": 1.1442, + "step": 1530 + }, + { + "epoch": 1.1587511825922423, + "grad_norm": 2.0686581134796143, + "learning_rate": 1.3726171896177687e-05, + "loss": 1.1778, + "step": 1531 + }, + { + "epoch": 1.159508041627247, + "grad_norm": 2.120643138885498, + "learning_rate": 1.3718720814160342e-05, + "loss": 1.1789, + "step": 1532 + }, + { + "epoch": 1.1602649006622516, + "grad_norm": 2.4168310165405273, + "learning_rate": 1.3711267335756862e-05, + "loss": 1.1816, + "step": 1533 + }, + { + "epoch": 1.1610217596972563, + "grad_norm": 2.183661460876465, + "learning_rate": 1.3703811465771636e-05, + "loss": 1.1861, + "step": 1534 + }, + { + "epoch": 1.1617786187322612, + "grad_norm": 2.196077346801758, + "learning_rate": 1.3696353209010609e-05, + "loss": 1.1845, + "step": 1535 + }, + { + "epoch": 1.1625354777672658, + "grad_norm": 2.280958890914917, + "learning_rate": 1.3688892570281261e-05, + "loss": 1.1371, + "step": 1536 + }, + { + "epoch": 1.1632923368022705, + "grad_norm": 2.3048434257507324, + "learning_rate": 1.3681429554392602e-05, + "loss": 1.15, + "step": 1537 + }, + { + "epoch": 1.1640491958372754, + "grad_norm": 2.177098512649536, + "learning_rate": 1.367396416615518e-05, + "loss": 1.1537, + "step": 1538 + }, + { + "epoch": 1.16480605487228, + "grad_norm": 2.419185161590576, + "learning_rate": 1.3666496410381072e-05, + "loss": 1.1634, + "step": 1539 + }, + { + "epoch": 1.1655629139072847, + "grad_norm": 2.5214250087738037, + "learning_rate": 1.3659026291883874e-05, + "loss": 1.1669, + "step": 1540 + }, + { + "epoch": 1.1663197729422894, + "grad_norm": 2.3618457317352295, + "learning_rate": 1.365155381547872e-05, + "loss": 1.2169, + "step": 1541 + }, + { + "epoch": 1.1670766319772943, + "grad_norm": 2.3737759590148926, + "learning_rate": 1.3644078985982243e-05, + "loss": 1.2004, + "step": 1542 + }, + { + "epoch": 1.167833491012299, + "grad_norm": 2.500761032104492, + "learning_rate": 1.3636601808212613e-05, + "loss": 1.1576, + "step": 1543 + }, + { + "epoch": 1.1685903500473036, + "grad_norm": 2.429725170135498, + "learning_rate": 1.36291222869895e-05, + "loss": 1.1412, + "step": 1544 + }, + { + "epoch": 1.1693472090823085, + "grad_norm": 2.4820287227630615, + "learning_rate": 1.3621640427134095e-05, + "loss": 1.1256, + "step": 1545 + }, + { + "epoch": 1.1701040681173132, + "grad_norm": 2.5075745582580566, + "learning_rate": 1.3614156233469081e-05, + "loss": 1.1426, + "step": 1546 + }, + { + "epoch": 1.1708609271523178, + "grad_norm": 2.5569803714752197, + "learning_rate": 1.3606669710818665e-05, + "loss": 1.1438, + "step": 1547 + }, + { + "epoch": 1.1716177861873227, + "grad_norm": 2.4151296615600586, + "learning_rate": 1.3599180864008538e-05, + "loss": 1.1325, + "step": 1548 + }, + { + "epoch": 1.1723746452223274, + "grad_norm": 2.1855361461639404, + "learning_rate": 1.3591689697865902e-05, + "loss": 1.1328, + "step": 1549 + }, + { + "epoch": 1.173131504257332, + "grad_norm": 2.130683422088623, + "learning_rate": 1.3584196217219443e-05, + "loss": 1.1411, + "step": 1550 + }, + { + "epoch": 1.173888363292337, + "grad_norm": 2.3407418727874756, + "learning_rate": 1.357670042689935e-05, + "loss": 1.1396, + "step": 1551 + }, + { + "epoch": 1.1746452223273416, + "grad_norm": 2.5837795734405518, + "learning_rate": 1.3569202331737292e-05, + "loss": 1.1858, + "step": 1552 + }, + { + "epoch": 1.1754020813623463, + "grad_norm": 2.2978811264038086, + "learning_rate": 1.3561701936566426e-05, + "loss": 1.1712, + "step": 1553 + }, + { + "epoch": 1.176158940397351, + "grad_norm": 2.5813682079315186, + "learning_rate": 1.355419924622139e-05, + "loss": 1.1282, + "step": 1554 + }, + { + "epoch": 1.1769157994323558, + "grad_norm": 2.0672824382781982, + "learning_rate": 1.3546694265538316e-05, + "loss": 1.1639, + "step": 1555 + }, + { + "epoch": 1.1776726584673605, + "grad_norm": 2.138291358947754, + "learning_rate": 1.3539186999354785e-05, + "loss": 1.1583, + "step": 1556 + }, + { + "epoch": 1.1784295175023651, + "grad_norm": 2.2805378437042236, + "learning_rate": 1.3531677452509873e-05, + "loss": 1.1315, + "step": 1557 + }, + { + "epoch": 1.1791863765373698, + "grad_norm": 2.4369373321533203, + "learning_rate": 1.3524165629844124e-05, + "loss": 1.1395, + "step": 1558 + }, + { + "epoch": 1.1799432355723747, + "grad_norm": 2.610330820083618, + "learning_rate": 1.3516651536199536e-05, + "loss": 1.1534, + "step": 1559 + }, + { + "epoch": 1.1807000946073793, + "grad_norm": 2.1532680988311768, + "learning_rate": 1.3509135176419583e-05, + "loss": 1.1266, + "step": 1560 + }, + { + "epoch": 1.181456953642384, + "grad_norm": 2.269569158554077, + "learning_rate": 1.3501616555349195e-05, + "loss": 1.1962, + "step": 1561 + }, + { + "epoch": 1.182213812677389, + "grad_norm": 2.3179900646209717, + "learning_rate": 1.3494095677834762e-05, + "loss": 1.1554, + "step": 1562 + }, + { + "epoch": 1.1829706717123936, + "grad_norm": 2.116596221923828, + "learning_rate": 1.3486572548724126e-05, + "loss": 1.124, + "step": 1563 + }, + { + "epoch": 1.1837275307473982, + "grad_norm": 2.4712612628936768, + "learning_rate": 1.347904717286658e-05, + "loss": 1.1336, + "step": 1564 + }, + { + "epoch": 1.1844843897824031, + "grad_norm": 2.1904261112213135, + "learning_rate": 1.3471519555112866e-05, + "loss": 1.1613, + "step": 1565 + }, + { + "epoch": 1.1852412488174078, + "grad_norm": 2.235826253890991, + "learning_rate": 1.3463989700315179e-05, + "loss": 1.1404, + "step": 1566 + }, + { + "epoch": 1.1859981078524124, + "grad_norm": 2.2662901878356934, + "learning_rate": 1.3456457613327136e-05, + "loss": 1.1985, + "step": 1567 + }, + { + "epoch": 1.1867549668874173, + "grad_norm": 2.3666486740112305, + "learning_rate": 1.3448923299003815e-05, + "loss": 1.125, + "step": 1568 + }, + { + "epoch": 1.187511825922422, + "grad_norm": 2.1387600898742676, + "learning_rate": 1.344138676220172e-05, + "loss": 1.157, + "step": 1569 + }, + { + "epoch": 1.1882686849574267, + "grad_norm": 2.4021949768066406, + "learning_rate": 1.3433848007778783e-05, + "loss": 1.1628, + "step": 1570 + }, + { + "epoch": 1.1890255439924313, + "grad_norm": 2.100867986679077, + "learning_rate": 1.3426307040594372e-05, + "loss": 1.1712, + "step": 1571 + }, + { + "epoch": 1.1897824030274362, + "grad_norm": 2.6818182468414307, + "learning_rate": 1.3418763865509283e-05, + "loss": 1.1505, + "step": 1572 + }, + { + "epoch": 1.1905392620624409, + "grad_norm": 2.2335827350616455, + "learning_rate": 1.3411218487385725e-05, + "loss": 1.1367, + "step": 1573 + }, + { + "epoch": 1.1912961210974455, + "grad_norm": 2.332047939300537, + "learning_rate": 1.3403670911087339e-05, + "loss": 1.1186, + "step": 1574 + }, + { + "epoch": 1.1920529801324504, + "grad_norm": 2.119150400161743, + "learning_rate": 1.339612114147917e-05, + "loss": 1.1748, + "step": 1575 + }, + { + "epoch": 1.192809839167455, + "grad_norm": 2.413939952850342, + "learning_rate": 1.3388569183427695e-05, + "loss": 1.178, + "step": 1576 + }, + { + "epoch": 1.1935666982024598, + "grad_norm": 2.265653371810913, + "learning_rate": 1.3381015041800787e-05, + "loss": 1.1532, + "step": 1577 + }, + { + "epoch": 1.1943235572374644, + "grad_norm": 2.1941394805908203, + "learning_rate": 1.3373458721467724e-05, + "loss": 1.1027, + "step": 1578 + }, + { + "epoch": 1.1950804162724693, + "grad_norm": 2.350780725479126, + "learning_rate": 1.3365900227299205e-05, + "loss": 1.1373, + "step": 1579 + }, + { + "epoch": 1.195837275307474, + "grad_norm": 2.401061773300171, + "learning_rate": 1.3358339564167313e-05, + "loss": 1.1602, + "step": 1580 + }, + { + "epoch": 1.1965941343424786, + "grad_norm": 2.3053834438323975, + "learning_rate": 1.3350776736945539e-05, + "loss": 1.0973, + "step": 1581 + }, + { + "epoch": 1.1973509933774835, + "grad_norm": 2.3348872661590576, + "learning_rate": 1.3343211750508769e-05, + "loss": 1.1439, + "step": 1582 + }, + { + "epoch": 1.1981078524124882, + "grad_norm": 2.255254030227661, + "learning_rate": 1.333564460973327e-05, + "loss": 1.1259, + "step": 1583 + }, + { + "epoch": 1.1988647114474928, + "grad_norm": 2.1117663383483887, + "learning_rate": 1.332807531949671e-05, + "loss": 1.1075, + "step": 1584 + }, + { + "epoch": 1.1996215704824977, + "grad_norm": 2.2909741401672363, + "learning_rate": 1.3320503884678141e-05, + "loss": 1.1518, + "step": 1585 + }, + { + "epoch": 1.2003784295175024, + "grad_norm": 2.2066426277160645, + "learning_rate": 1.331293031015799e-05, + "loss": 1.1617, + "step": 1586 + }, + { + "epoch": 1.201135288552507, + "grad_norm": 2.2523305416107178, + "learning_rate": 1.3305354600818068e-05, + "loss": 1.1961, + "step": 1587 + }, + { + "epoch": 1.201892147587512, + "grad_norm": 2.1978890895843506, + "learning_rate": 1.3297776761541566e-05, + "loss": 1.154, + "step": 1588 + }, + { + "epoch": 1.2026490066225166, + "grad_norm": 2.333961009979248, + "learning_rate": 1.3290196797213037e-05, + "loss": 1.1201, + "step": 1589 + }, + { + "epoch": 1.2034058656575213, + "grad_norm": 2.1890499591827393, + "learning_rate": 1.3282614712718412e-05, + "loss": 1.1166, + "step": 1590 + }, + { + "epoch": 1.204162724692526, + "grad_norm": 2.2715249061584473, + "learning_rate": 1.3275030512944995e-05, + "loss": 1.1702, + "step": 1591 + }, + { + "epoch": 1.2049195837275308, + "grad_norm": 2.378854513168335, + "learning_rate": 1.3267444202781434e-05, + "loss": 1.1674, + "step": 1592 + }, + { + "epoch": 1.2056764427625355, + "grad_norm": 2.085010290145874, + "learning_rate": 1.3259855787117758e-05, + "loss": 1.1709, + "step": 1593 + }, + { + "epoch": 1.2064333017975402, + "grad_norm": 2.3028149604797363, + "learning_rate": 1.3252265270845339e-05, + "loss": 1.1304, + "step": 1594 + }, + { + "epoch": 1.2071901608325448, + "grad_norm": 2.0950684547424316, + "learning_rate": 1.3244672658856908e-05, + "loss": 1.1585, + "step": 1595 + }, + { + "epoch": 1.2079470198675497, + "grad_norm": 2.2300803661346436, + "learning_rate": 1.3237077956046551e-05, + "loss": 1.1123, + "step": 1596 + }, + { + "epoch": 1.2087038789025544, + "grad_norm": 2.1364376544952393, + "learning_rate": 1.3229481167309692e-05, + "loss": 1.1112, + "step": 1597 + }, + { + "epoch": 1.209460737937559, + "grad_norm": 2.9876246452331543, + "learning_rate": 1.322188229754311e-05, + "loss": 1.1989, + "step": 1598 + }, + { + "epoch": 1.210217596972564, + "grad_norm": 2.1434969902038574, + "learning_rate": 1.3214281351644918e-05, + "loss": 1.1665, + "step": 1599 + }, + { + "epoch": 1.2109744560075686, + "grad_norm": 2.142533779144287, + "learning_rate": 1.3206678334514571e-05, + "loss": 1.1229, + "step": 1600 + }, + { + "epoch": 1.2117313150425733, + "grad_norm": 2.065274715423584, + "learning_rate": 1.3199073251052854e-05, + "loss": 1.1167, + "step": 1601 + }, + { + "epoch": 1.2124881740775781, + "grad_norm": 2.128526449203491, + "learning_rate": 1.3191466106161893e-05, + "loss": 1.1622, + "step": 1602 + }, + { + "epoch": 1.2132450331125828, + "grad_norm": 2.075362205505371, + "learning_rate": 1.3183856904745135e-05, + "loss": 1.1541, + "step": 1603 + }, + { + "epoch": 1.2140018921475875, + "grad_norm": 2.4913156032562256, + "learning_rate": 1.3176245651707357e-05, + "loss": 1.1635, + "step": 1604 + }, + { + "epoch": 1.2147587511825924, + "grad_norm": 2.1509463787078857, + "learning_rate": 1.3168632351954653e-05, + "loss": 1.1317, + "step": 1605 + }, + { + "epoch": 1.215515610217597, + "grad_norm": 2.2484796047210693, + "learning_rate": 1.3161017010394444e-05, + "loss": 1.1342, + "step": 1606 + }, + { + "epoch": 1.2162724692526017, + "grad_norm": 2.2622358798980713, + "learning_rate": 1.3153399631935463e-05, + "loss": 1.1416, + "step": 1607 + }, + { + "epoch": 1.2170293282876063, + "grad_norm": 2.4243550300598145, + "learning_rate": 1.3145780221487754e-05, + "loss": 1.1653, + "step": 1608 + }, + { + "epoch": 1.2177861873226112, + "grad_norm": 2.211627960205078, + "learning_rate": 1.3138158783962668e-05, + "loss": 1.177, + "step": 1609 + }, + { + "epoch": 1.218543046357616, + "grad_norm": 2.025865316390991, + "learning_rate": 1.3130535324272884e-05, + "loss": 1.1536, + "step": 1610 + }, + { + "epoch": 1.2192999053926206, + "grad_norm": 2.297100782394409, + "learning_rate": 1.3122909847332349e-05, + "loss": 1.2091, + "step": 1611 + }, + { + "epoch": 1.2200567644276255, + "grad_norm": 2.41648006439209, + "learning_rate": 1.3115282358056333e-05, + "loss": 1.183, + "step": 1612 + }, + { + "epoch": 1.2208136234626301, + "grad_norm": 2.1309831142425537, + "learning_rate": 1.3107652861361408e-05, + "loss": 1.1715, + "step": 1613 + }, + { + "epoch": 1.2215704824976348, + "grad_norm": 2.268522262573242, + "learning_rate": 1.3100021362165426e-05, + "loss": 1.1762, + "step": 1614 + }, + { + "epoch": 1.2223273415326394, + "grad_norm": 2.296552896499634, + "learning_rate": 1.3092387865387533e-05, + "loss": 1.104, + "step": 1615 + }, + { + "epoch": 1.2230842005676443, + "grad_norm": 2.1397440433502197, + "learning_rate": 1.3084752375948166e-05, + "loss": 1.1284, + "step": 1616 + }, + { + "epoch": 1.223841059602649, + "grad_norm": 2.097498893737793, + "learning_rate": 1.3077114898769048e-05, + "loss": 1.124, + "step": 1617 + }, + { + "epoch": 1.2245979186376537, + "grad_norm": 2.212064027786255, + "learning_rate": 1.3069475438773178e-05, + "loss": 1.1184, + "step": 1618 + }, + { + "epoch": 1.2253547776726585, + "grad_norm": 2.323784351348877, + "learning_rate": 1.3061834000884831e-05, + "loss": 1.1615, + "step": 1619 + }, + { + "epoch": 1.2261116367076632, + "grad_norm": 2.1432077884674072, + "learning_rate": 1.3054190590029572e-05, + "loss": 1.1564, + "step": 1620 + }, + { + "epoch": 1.2268684957426679, + "grad_norm": 2.3040294647216797, + "learning_rate": 1.3046545211134218e-05, + "loss": 1.1227, + "step": 1621 + }, + { + "epoch": 1.2276253547776728, + "grad_norm": 2.406848669052124, + "learning_rate": 1.3038897869126865e-05, + "loss": 1.1577, + "step": 1622 + }, + { + "epoch": 1.2283822138126774, + "grad_norm": 2.3050808906555176, + "learning_rate": 1.3031248568936877e-05, + "loss": 1.1924, + "step": 1623 + }, + { + "epoch": 1.229139072847682, + "grad_norm": 2.03425669670105, + "learning_rate": 1.3023597315494874e-05, + "loss": 1.1474, + "step": 1624 + }, + { + "epoch": 1.2298959318826868, + "grad_norm": 2.2921745777130127, + "learning_rate": 1.3015944113732734e-05, + "loss": 1.1815, + "step": 1625 + }, + { + "epoch": 1.2306527909176916, + "grad_norm": 2.248823881149292, + "learning_rate": 1.3008288968583603e-05, + "loss": 1.1482, + "step": 1626 + }, + { + "epoch": 1.2314096499526963, + "grad_norm": 2.3645107746124268, + "learning_rate": 1.3000631884981858e-05, + "loss": 1.1383, + "step": 1627 + }, + { + "epoch": 1.232166508987701, + "grad_norm": 2.0877134799957275, + "learning_rate": 1.2992972867863147e-05, + "loss": 1.2064, + "step": 1628 + }, + { + "epoch": 1.2329233680227059, + "grad_norm": 2.3611538410186768, + "learning_rate": 1.2985311922164359e-05, + "loss": 1.1582, + "step": 1629 + }, + { + "epoch": 1.2336802270577105, + "grad_norm": 2.087958335876465, + "learning_rate": 1.2977649052823616e-05, + "loss": 1.1553, + "step": 1630 + }, + { + "epoch": 1.2344370860927152, + "grad_norm": 2.2635905742645264, + "learning_rate": 1.2969984264780283e-05, + "loss": 1.1704, + "step": 1631 + }, + { + "epoch": 1.2351939451277199, + "grad_norm": 2.251215934753418, + "learning_rate": 1.2962317562974976e-05, + "loss": 1.1855, + "step": 1632 + }, + { + "epoch": 1.2359508041627247, + "grad_norm": 3.073653221130371, + "learning_rate": 1.2954648952349527e-05, + "loss": 1.1935, + "step": 1633 + }, + { + "epoch": 1.2367076631977294, + "grad_norm": 2.2269108295440674, + "learning_rate": 1.2946978437847004e-05, + "loss": 1.146, + "step": 1634 + }, + { + "epoch": 1.237464522232734, + "grad_norm": 2.4930577278137207, + "learning_rate": 1.2939306024411713e-05, + "loss": 1.1703, + "step": 1635 + }, + { + "epoch": 1.238221381267739, + "grad_norm": 2.6076838970184326, + "learning_rate": 1.2931631716989166e-05, + "loss": 1.1756, + "step": 1636 + }, + { + "epoch": 1.2389782403027436, + "grad_norm": 2.250298023223877, + "learning_rate": 1.292395552052611e-05, + "loss": 1.1607, + "step": 1637 + }, + { + "epoch": 1.2397350993377483, + "grad_norm": 2.2543821334838867, + "learning_rate": 1.29162774399705e-05, + "loss": 1.1653, + "step": 1638 + }, + { + "epoch": 1.2404919583727532, + "grad_norm": 2.1660284996032715, + "learning_rate": 1.290859748027151e-05, + "loss": 1.0827, + "step": 1639 + }, + { + "epoch": 1.2412488174077578, + "grad_norm": 2.392023801803589, + "learning_rate": 1.2900915646379524e-05, + "loss": 1.1651, + "step": 1640 + }, + { + "epoch": 1.2420056764427625, + "grad_norm": 2.147473096847534, + "learning_rate": 1.2893231943246143e-05, + "loss": 1.1012, + "step": 1641 + }, + { + "epoch": 1.2427625354777674, + "grad_norm": 2.2261102199554443, + "learning_rate": 1.2885546375824154e-05, + "loss": 1.1313, + "step": 1642 + }, + { + "epoch": 1.243519394512772, + "grad_norm": 2.1518611907958984, + "learning_rate": 1.2877858949067564e-05, + "loss": 1.1309, + "step": 1643 + }, + { + "epoch": 1.2442762535477767, + "grad_norm": 2.2095835208892822, + "learning_rate": 1.2870169667931567e-05, + "loss": 1.109, + "step": 1644 + }, + { + "epoch": 1.2450331125827814, + "grad_norm": 2.3556344509124756, + "learning_rate": 1.2862478537372556e-05, + "loss": 1.0937, + "step": 1645 + }, + { + "epoch": 1.2457899716177863, + "grad_norm": 2.178274154663086, + "learning_rate": 1.2854785562348121e-05, + "loss": 1.1857, + "step": 1646 + }, + { + "epoch": 1.246546830652791, + "grad_norm": 2.1965596675872803, + "learning_rate": 1.2847090747817033e-05, + "loss": 1.1292, + "step": 1647 + }, + { + "epoch": 1.2473036896877956, + "grad_norm": 2.359292984008789, + "learning_rate": 1.2839394098739257e-05, + "loss": 1.1753, + "step": 1648 + }, + { + "epoch": 1.2480605487228003, + "grad_norm": 2.189749002456665, + "learning_rate": 1.2831695620075926e-05, + "loss": 1.0852, + "step": 1649 + }, + { + "epoch": 1.2488174077578051, + "grad_norm": 2.1658499240875244, + "learning_rate": 1.2823995316789366e-05, + "loss": 1.1278, + "step": 1650 + }, + { + "epoch": 1.2495742667928098, + "grad_norm": 2.249729871749878, + "learning_rate": 1.281629319384308e-05, + "loss": 1.1829, + "step": 1651 + }, + { + "epoch": 1.2503311258278145, + "grad_norm": 2.2419471740722656, + "learning_rate": 1.2808589256201735e-05, + "loss": 1.1373, + "step": 1652 + }, + { + "epoch": 1.2510879848628194, + "grad_norm": 2.296644449234009, + "learning_rate": 1.280088350883117e-05, + "loss": 1.1724, + "step": 1653 + }, + { + "epoch": 1.251844843897824, + "grad_norm": 2.77717661857605, + "learning_rate": 1.2793175956698398e-05, + "loss": 1.1367, + "step": 1654 + }, + { + "epoch": 1.2526017029328287, + "grad_norm": 2.4535298347473145, + "learning_rate": 1.2785466604771584e-05, + "loss": 1.1874, + "step": 1655 + }, + { + "epoch": 1.2533585619678336, + "grad_norm": 2.1663715839385986, + "learning_rate": 1.2777755458020058e-05, + "loss": 1.1819, + "step": 1656 + }, + { + "epoch": 1.2541154210028382, + "grad_norm": 2.467954158782959, + "learning_rate": 1.2770042521414314e-05, + "loss": 1.1761, + "step": 1657 + }, + { + "epoch": 1.254872280037843, + "grad_norm": 2.3556721210479736, + "learning_rate": 1.2762327799925991e-05, + "loss": 1.1574, + "step": 1658 + }, + { + "epoch": 1.2556291390728478, + "grad_norm": 2.204136371612549, + "learning_rate": 1.2754611298527875e-05, + "loss": 1.0962, + "step": 1659 + }, + { + "epoch": 1.2563859981078525, + "grad_norm": 2.3233225345611572, + "learning_rate": 1.274689302219391e-05, + "loss": 1.1369, + "step": 1660 + }, + { + "epoch": 1.2571428571428571, + "grad_norm": 2.5201222896575928, + "learning_rate": 1.2739172975899181e-05, + "loss": 1.1593, + "step": 1661 + }, + { + "epoch": 1.257899716177862, + "grad_norm": 2.531087875366211, + "learning_rate": 1.273145116461991e-05, + "loss": 1.1411, + "step": 1662 + }, + { + "epoch": 1.2586565752128667, + "grad_norm": 2.510352373123169, + "learning_rate": 1.2723727593333454e-05, + "loss": 1.1426, + "step": 1663 + }, + { + "epoch": 1.2594134342478713, + "grad_norm": 2.217392921447754, + "learning_rate": 1.2716002267018314e-05, + "loss": 1.0712, + "step": 1664 + }, + { + "epoch": 1.260170293282876, + "grad_norm": 2.3988654613494873, + "learning_rate": 1.2708275190654126e-05, + "loss": 1.1884, + "step": 1665 + }, + { + "epoch": 1.2609271523178807, + "grad_norm": 2.151139259338379, + "learning_rate": 1.2700546369221628e-05, + "loss": 1.095, + "step": 1666 + }, + { + "epoch": 1.2616840113528855, + "grad_norm": 2.3287789821624756, + "learning_rate": 1.2692815807702711e-05, + "loss": 1.1726, + "step": 1667 + }, + { + "epoch": 1.2624408703878902, + "grad_norm": 2.1874783039093018, + "learning_rate": 1.268508351108038e-05, + "loss": 1.1389, + "step": 1668 + }, + { + "epoch": 1.2631977294228949, + "grad_norm": 2.501871347427368, + "learning_rate": 1.2677349484338747e-05, + "loss": 1.1717, + "step": 1669 + }, + { + "epoch": 1.2639545884578998, + "grad_norm": 2.2890784740448, + "learning_rate": 1.2669613732463053e-05, + "loss": 1.1069, + "step": 1670 + }, + { + "epoch": 1.2647114474929044, + "grad_norm": 2.41701340675354, + "learning_rate": 1.2661876260439642e-05, + "loss": 1.142, + "step": 1671 + }, + { + "epoch": 1.265468306527909, + "grad_norm": 2.5459794998168945, + "learning_rate": 1.2654137073255976e-05, + "loss": 1.1071, + "step": 1672 + }, + { + "epoch": 1.266225165562914, + "grad_norm": 2.1220319271087646, + "learning_rate": 1.2646396175900612e-05, + "loss": 1.1644, + "step": 1673 + }, + { + "epoch": 1.2669820245979186, + "grad_norm": 2.383187770843506, + "learning_rate": 1.2638653573363215e-05, + "loss": 1.1515, + "step": 1674 + }, + { + "epoch": 1.2677388836329233, + "grad_norm": 2.090808868408203, + "learning_rate": 1.2630909270634554e-05, + "loss": 1.1151, + "step": 1675 + }, + { + "epoch": 1.2684957426679282, + "grad_norm": 2.306619882583618, + "learning_rate": 1.2623163272706483e-05, + "loss": 1.177, + "step": 1676 + }, + { + "epoch": 1.2692526017029329, + "grad_norm": 2.4056408405303955, + "learning_rate": 1.261541558457195e-05, + "loss": 1.1811, + "step": 1677 + }, + { + "epoch": 1.2700094607379375, + "grad_norm": 2.4404773712158203, + "learning_rate": 1.2607666211225002e-05, + "loss": 1.1686, + "step": 1678 + }, + { + "epoch": 1.2707663197729424, + "grad_norm": 2.1623356342315674, + "learning_rate": 1.2599915157660776e-05, + "loss": 1.1472, + "step": 1679 + }, + { + "epoch": 1.271523178807947, + "grad_norm": 2.5244863033294678, + "learning_rate": 1.2592162428875465e-05, + "loss": 1.1093, + "step": 1680 + }, + { + "epoch": 1.2722800378429517, + "grad_norm": 2.649132251739502, + "learning_rate": 1.2584408029866373e-05, + "loss": 1.1644, + "step": 1681 + }, + { + "epoch": 1.2730368968779564, + "grad_norm": 2.245384931564331, + "learning_rate": 1.2576651965631862e-05, + "loss": 1.2137, + "step": 1682 + }, + { + "epoch": 1.2737937559129613, + "grad_norm": 3.0994908809661865, + "learning_rate": 1.256889424117137e-05, + "loss": 1.1189, + "step": 1683 + }, + { + "epoch": 1.274550614947966, + "grad_norm": 2.228210210800171, + "learning_rate": 1.2561134861485413e-05, + "loss": 1.1694, + "step": 1684 + }, + { + "epoch": 1.2753074739829706, + "grad_norm": 2.0974786281585693, + "learning_rate": 1.2553373831575572e-05, + "loss": 1.1661, + "step": 1685 + }, + { + "epoch": 1.2760643330179753, + "grad_norm": 2.1458041667938232, + "learning_rate": 1.2545611156444477e-05, + "loss": 1.0814, + "step": 1686 + }, + { + "epoch": 1.2768211920529802, + "grad_norm": 2.163115978240967, + "learning_rate": 1.253784684109584e-05, + "loss": 1.1643, + "step": 1687 + }, + { + "epoch": 1.2775780510879848, + "grad_norm": 2.122997999191284, + "learning_rate": 1.2530080890534416e-05, + "loss": 1.172, + "step": 1688 + }, + { + "epoch": 1.2783349101229895, + "grad_norm": 2.4748451709747314, + "learning_rate": 1.2522313309766021e-05, + "loss": 1.1489, + "step": 1689 + }, + { + "epoch": 1.2790917691579944, + "grad_norm": 2.201387882232666, + "learning_rate": 1.2514544103797517e-05, + "loss": 1.1509, + "step": 1690 + }, + { + "epoch": 1.279848628192999, + "grad_norm": 2.158069610595703, + "learning_rate": 1.2506773277636812e-05, + "loss": 1.1284, + "step": 1691 + }, + { + "epoch": 1.2806054872280037, + "grad_norm": 2.192920684814453, + "learning_rate": 1.2499000836292875e-05, + "loss": 1.156, + "step": 1692 + }, + { + "epoch": 1.2813623462630086, + "grad_norm": 2.266641855239868, + "learning_rate": 1.2491226784775685e-05, + "loss": 1.1298, + "step": 1693 + }, + { + "epoch": 1.2821192052980133, + "grad_norm": 2.677654981613159, + "learning_rate": 1.2483451128096289e-05, + "loss": 1.1472, + "step": 1694 + }, + { + "epoch": 1.282876064333018, + "grad_norm": 2.4137139320373535, + "learning_rate": 1.2475673871266756e-05, + "loss": 1.075, + "step": 1695 + }, + { + "epoch": 1.2836329233680228, + "grad_norm": 2.201813220977783, + "learning_rate": 1.2467895019300187e-05, + "loss": 1.1224, + "step": 1696 + }, + { + "epoch": 1.2843897824030275, + "grad_norm": 2.1659185886383057, + "learning_rate": 1.2460114577210703e-05, + "loss": 1.1606, + "step": 1697 + }, + { + "epoch": 1.2851466414380321, + "grad_norm": 2.3215322494506836, + "learning_rate": 1.245233255001347e-05, + "loss": 1.1408, + "step": 1698 + }, + { + "epoch": 1.285903500473037, + "grad_norm": 2.530764579772949, + "learning_rate": 1.2444548942724657e-05, + "loss": 1.1433, + "step": 1699 + }, + { + "epoch": 1.2866603595080417, + "grad_norm": 2.5110771656036377, + "learning_rate": 1.2436763760361461e-05, + "loss": 1.1644, + "step": 1700 + }, + { + "epoch": 1.2874172185430464, + "grad_norm": 2.0336203575134277, + "learning_rate": 1.2428977007942092e-05, + "loss": 1.1538, + "step": 1701 + }, + { + "epoch": 1.288174077578051, + "grad_norm": 2.3316948413848877, + "learning_rate": 1.2421188690485767e-05, + "loss": 1.1349, + "step": 1702 + }, + { + "epoch": 1.2889309366130557, + "grad_norm": 2.151745319366455, + "learning_rate": 1.241339881301273e-05, + "loss": 1.1464, + "step": 1703 + }, + { + "epoch": 1.2896877956480606, + "grad_norm": 2.1439285278320312, + "learning_rate": 1.2405607380544198e-05, + "loss": 1.1384, + "step": 1704 + }, + { + "epoch": 1.2904446546830652, + "grad_norm": 2.0676236152648926, + "learning_rate": 1.239781439810242e-05, + "loss": 1.1315, + "step": 1705 + }, + { + "epoch": 1.29120151371807, + "grad_norm": 2.1534860134124756, + "learning_rate": 1.239001987071064e-05, + "loss": 1.1232, + "step": 1706 + }, + { + "epoch": 1.2919583727530748, + "grad_norm": 2.4337046146392822, + "learning_rate": 1.238222380339308e-05, + "loss": 1.1637, + "step": 1707 + }, + { + "epoch": 1.2927152317880795, + "grad_norm": 3.3103768825531006, + "learning_rate": 1.2374426201174974e-05, + "loss": 1.1255, + "step": 1708 + }, + { + "epoch": 1.2934720908230841, + "grad_norm": 2.3964853286743164, + "learning_rate": 1.2366627069082533e-05, + "loss": 1.1474, + "step": 1709 + }, + { + "epoch": 1.294228949858089, + "grad_norm": 2.196171760559082, + "learning_rate": 1.235882641214296e-05, + "loss": 1.1152, + "step": 1710 + }, + { + "epoch": 1.2949858088930937, + "grad_norm": 2.2231311798095703, + "learning_rate": 1.2351024235384435e-05, + "loss": 1.0872, + "step": 1711 + }, + { + "epoch": 1.2957426679280983, + "grad_norm": 3.2890310287475586, + "learning_rate": 1.2343220543836132e-05, + "loss": 1.1376, + "step": 1712 + }, + { + "epoch": 1.2964995269631032, + "grad_norm": 2.37038516998291, + "learning_rate": 1.2335415342528186e-05, + "loss": 1.1309, + "step": 1713 + }, + { + "epoch": 1.2972563859981079, + "grad_norm": 2.0955164432525635, + "learning_rate": 1.2327608636491706e-05, + "loss": 1.1721, + "step": 1714 + }, + { + "epoch": 1.2980132450331126, + "grad_norm": 2.3492562770843506, + "learning_rate": 1.2319800430758787e-05, + "loss": 1.1855, + "step": 1715 + }, + { + "epoch": 1.2987701040681174, + "grad_norm": 2.366028308868408, + "learning_rate": 1.231199073036247e-05, + "loss": 1.1547, + "step": 1716 + }, + { + "epoch": 1.299526963103122, + "grad_norm": 2.163280725479126, + "learning_rate": 1.230417954033677e-05, + "loss": 1.1289, + "step": 1717 + }, + { + "epoch": 1.3002838221381268, + "grad_norm": 2.1231632232666016, + "learning_rate": 1.2296366865716663e-05, + "loss": 1.1386, + "step": 1718 + }, + { + "epoch": 1.3010406811731314, + "grad_norm": 2.1293210983276367, + "learning_rate": 1.2288552711538076e-05, + "loss": 1.185, + "step": 1719 + }, + { + "epoch": 1.3017975402081363, + "grad_norm": 2.0795953273773193, + "learning_rate": 1.2280737082837903e-05, + "loss": 1.0935, + "step": 1720 + }, + { + "epoch": 1.302554399243141, + "grad_norm": 2.1011762619018555, + "learning_rate": 1.2272919984653972e-05, + "loss": 1.1672, + "step": 1721 + }, + { + "epoch": 1.3033112582781456, + "grad_norm": 2.221156120300293, + "learning_rate": 1.2265101422025064e-05, + "loss": 1.1073, + "step": 1722 + }, + { + "epoch": 1.3040681173131503, + "grad_norm": 2.2249984741210938, + "learning_rate": 1.2257281399990913e-05, + "loss": 1.1659, + "step": 1723 + }, + { + "epoch": 1.3048249763481552, + "grad_norm": 4.573660850524902, + "learning_rate": 1.2249459923592176e-05, + "loss": 1.1835, + "step": 1724 + }, + { + "epoch": 1.3055818353831599, + "grad_norm": 2.1640846729278564, + "learning_rate": 1.2241636997870459e-05, + "loss": 1.1342, + "step": 1725 + }, + { + "epoch": 1.3063386944181645, + "grad_norm": 2.010333299636841, + "learning_rate": 1.223381262786831e-05, + "loss": 1.1, + "step": 1726 + }, + { + "epoch": 1.3070955534531694, + "grad_norm": 2.026108980178833, + "learning_rate": 1.2225986818629188e-05, + "loss": 1.1424, + "step": 1727 + }, + { + "epoch": 1.307852412488174, + "grad_norm": 2.0564801692962646, + "learning_rate": 1.22181595751975e-05, + "loss": 1.1336, + "step": 1728 + }, + { + "epoch": 1.3086092715231787, + "grad_norm": 1.9752734899520874, + "learning_rate": 1.2210330902618555e-05, + "loss": 1.1617, + "step": 1729 + }, + { + "epoch": 1.3093661305581836, + "grad_norm": 2.1132519245147705, + "learning_rate": 1.2202500805938606e-05, + "loss": 1.1841, + "step": 1730 + }, + { + "epoch": 1.3101229895931883, + "grad_norm": 2.167475461959839, + "learning_rate": 1.2194669290204813e-05, + "loss": 1.1255, + "step": 1731 + }, + { + "epoch": 1.310879848628193, + "grad_norm": 2.1125247478485107, + "learning_rate": 1.2186836360465249e-05, + "loss": 1.1434, + "step": 1732 + }, + { + "epoch": 1.3116367076631978, + "grad_norm": 2.126776933670044, + "learning_rate": 1.21790020217689e-05, + "loss": 1.1626, + "step": 1733 + }, + { + "epoch": 1.3123935666982025, + "grad_norm": 2.1454262733459473, + "learning_rate": 1.2171166279165668e-05, + "loss": 1.116, + "step": 1734 + }, + { + "epoch": 1.3131504257332072, + "grad_norm": 2.166027545928955, + "learning_rate": 1.216332913770634e-05, + "loss": 1.145, + "step": 1735 + }, + { + "epoch": 1.313907284768212, + "grad_norm": 2.0442612171173096, + "learning_rate": 1.2155490602442628e-05, + "loss": 1.1443, + "step": 1736 + }, + { + "epoch": 1.3146641438032167, + "grad_norm": 2.2408742904663086, + "learning_rate": 1.2147650678427136e-05, + "loss": 1.1297, + "step": 1737 + }, + { + "epoch": 1.3154210028382214, + "grad_norm": 2.039287805557251, + "learning_rate": 1.213980937071335e-05, + "loss": 1.1183, + "step": 1738 + }, + { + "epoch": 1.316177861873226, + "grad_norm": 2.4958298206329346, + "learning_rate": 1.213196668435566e-05, + "loss": 1.1127, + "step": 1739 + }, + { + "epoch": 1.3169347209082307, + "grad_norm": 2.0684995651245117, + "learning_rate": 1.212412262440935e-05, + "loss": 1.1092, + "step": 1740 + }, + { + "epoch": 1.3176915799432356, + "grad_norm": 2.2518489360809326, + "learning_rate": 1.2116277195930566e-05, + "loss": 1.1256, + "step": 1741 + }, + { + "epoch": 1.3184484389782403, + "grad_norm": 2.2096285820007324, + "learning_rate": 1.2108430403976363e-05, + "loss": 1.1785, + "step": 1742 + }, + { + "epoch": 1.319205298013245, + "grad_norm": 2.1161551475524902, + "learning_rate": 1.2100582253604663e-05, + "loss": 1.1816, + "step": 1743 + }, + { + "epoch": 1.3199621570482498, + "grad_norm": 2.2261106967926025, + "learning_rate": 1.2092732749874258e-05, + "loss": 1.1512, + "step": 1744 + }, + { + "epoch": 1.3207190160832545, + "grad_norm": 2.4035398960113525, + "learning_rate": 1.2084881897844827e-05, + "loss": 1.1505, + "step": 1745 + }, + { + "epoch": 1.3214758751182591, + "grad_norm": 2.2090861797332764, + "learning_rate": 1.2077029702576898e-05, + "loss": 1.1521, + "step": 1746 + }, + { + "epoch": 1.322232734153264, + "grad_norm": 2.054429769515991, + "learning_rate": 1.2069176169131889e-05, + "loss": 1.1863, + "step": 1747 + }, + { + "epoch": 1.3229895931882687, + "grad_norm": 2.0456814765930176, + "learning_rate": 1.2061321302572063e-05, + "loss": 1.1125, + "step": 1748 + }, + { + "epoch": 1.3237464522232734, + "grad_norm": 2.066861629486084, + "learning_rate": 1.2053465107960536e-05, + "loss": 1.1107, + "step": 1749 + }, + { + "epoch": 1.3245033112582782, + "grad_norm": 2.2116339206695557, + "learning_rate": 1.204560759036131e-05, + "loss": 1.1796, + "step": 1750 + }, + { + "epoch": 1.325260170293283, + "grad_norm": 2.355694532394409, + "learning_rate": 1.203774875483921e-05, + "loss": 1.1221, + "step": 1751 + }, + { + "epoch": 1.3260170293282876, + "grad_norm": 2.3318047523498535, + "learning_rate": 1.202988860645992e-05, + "loss": 1.1482, + "step": 1752 + }, + { + "epoch": 1.3267738883632925, + "grad_norm": 2.3973910808563232, + "learning_rate": 1.202202715028998e-05, + "loss": 1.09, + "step": 1753 + }, + { + "epoch": 1.3275307473982971, + "grad_norm": 2.3162357807159424, + "learning_rate": 1.2014164391396761e-05, + "loss": 1.1362, + "step": 1754 + }, + { + "epoch": 1.3282876064333018, + "grad_norm": 2.232311964035034, + "learning_rate": 1.2006300334848472e-05, + "loss": 1.1419, + "step": 1755 + }, + { + "epoch": 1.3290444654683065, + "grad_norm": 2.3239498138427734, + "learning_rate": 1.1998434985714172e-05, + "loss": 1.151, + "step": 1756 + }, + { + "epoch": 1.3298013245033111, + "grad_norm": 2.5459787845611572, + "learning_rate": 1.1990568349063742e-05, + "loss": 1.1649, + "step": 1757 + }, + { + "epoch": 1.330558183538316, + "grad_norm": 2.184105396270752, + "learning_rate": 1.1982700429967893e-05, + "loss": 1.1334, + "step": 1758 + }, + { + "epoch": 1.3313150425733207, + "grad_norm": 2.092010498046875, + "learning_rate": 1.1974831233498175e-05, + "loss": 1.0945, + "step": 1759 + }, + { + "epoch": 1.3320719016083253, + "grad_norm": 2.204160690307617, + "learning_rate": 1.1966960764726937e-05, + "loss": 1.1411, + "step": 1760 + }, + { + "epoch": 1.3328287606433302, + "grad_norm": 2.467329978942871, + "learning_rate": 1.195908902872738e-05, + "loss": 1.1259, + "step": 1761 + }, + { + "epoch": 1.3335856196783349, + "grad_norm": 2.2322754859924316, + "learning_rate": 1.1951216030573489e-05, + "loss": 1.1204, + "step": 1762 + }, + { + "epoch": 1.3343424787133396, + "grad_norm": 2.1422557830810547, + "learning_rate": 1.1943341775340087e-05, + "loss": 1.1306, + "step": 1763 + }, + { + "epoch": 1.3350993377483444, + "grad_norm": 2.393411159515381, + "learning_rate": 1.1935466268102802e-05, + "loss": 1.1409, + "step": 1764 + }, + { + "epoch": 1.335856196783349, + "grad_norm": 2.2010276317596436, + "learning_rate": 1.192758951393806e-05, + "loss": 1.0952, + "step": 1765 + }, + { + "epoch": 1.3366130558183538, + "grad_norm": 2.128002166748047, + "learning_rate": 1.1919711517923095e-05, + "loss": 1.1084, + "step": 1766 + }, + { + "epoch": 1.3373699148533587, + "grad_norm": 2.090876340866089, + "learning_rate": 1.1911832285135953e-05, + "loss": 1.1409, + "step": 1767 + }, + { + "epoch": 1.3381267738883633, + "grad_norm": 2.232081890106201, + "learning_rate": 1.1903951820655458e-05, + "loss": 1.176, + "step": 1768 + }, + { + "epoch": 1.338883632923368, + "grad_norm": 2.2187860012054443, + "learning_rate": 1.1896070129561237e-05, + "loss": 1.1094, + "step": 1769 + }, + { + "epoch": 1.3396404919583729, + "grad_norm": 2.2788565158843994, + "learning_rate": 1.1888187216933715e-05, + "loss": 1.1302, + "step": 1770 + }, + { + "epoch": 1.3403973509933775, + "grad_norm": 2.153656482696533, + "learning_rate": 1.1880303087854093e-05, + "loss": 1.1742, + "step": 1771 + }, + { + "epoch": 1.3411542100283822, + "grad_norm": 2.2120232582092285, + "learning_rate": 1.187241774740436e-05, + "loss": 1.1553, + "step": 1772 + }, + { + "epoch": 1.3419110690633869, + "grad_norm": 2.09063720703125, + "learning_rate": 1.1864531200667283e-05, + "loss": 1.1231, + "step": 1773 + }, + { + "epoch": 1.3426679280983917, + "grad_norm": 2.186126232147217, + "learning_rate": 1.1856643452726417e-05, + "loss": 1.1121, + "step": 1774 + }, + { + "epoch": 1.3434247871333964, + "grad_norm": 2.706040620803833, + "learning_rate": 1.1848754508666084e-05, + "loss": 1.1323, + "step": 1775 + }, + { + "epoch": 1.344181646168401, + "grad_norm": 2.1138103008270264, + "learning_rate": 1.1840864373571368e-05, + "loss": 1.1255, + "step": 1776 + }, + { + "epoch": 1.3449385052034057, + "grad_norm": 2.1181037425994873, + "learning_rate": 1.1832973052528136e-05, + "loss": 1.1406, + "step": 1777 + }, + { + "epoch": 1.3456953642384106, + "grad_norm": 2.0773799419403076, + "learning_rate": 1.1825080550623014e-05, + "loss": 1.1159, + "step": 1778 + }, + { + "epoch": 1.3464522232734153, + "grad_norm": 2.3848013877868652, + "learning_rate": 1.1817186872943385e-05, + "loss": 1.1687, + "step": 1779 + }, + { + "epoch": 1.34720908230842, + "grad_norm": 2.052957534790039, + "learning_rate": 1.1809292024577397e-05, + "loss": 1.176, + "step": 1780 + }, + { + "epoch": 1.3479659413434248, + "grad_norm": 2.1066739559173584, + "learning_rate": 1.1801396010613947e-05, + "loss": 1.1563, + "step": 1781 + }, + { + "epoch": 1.3487228003784295, + "grad_norm": 2.2263689041137695, + "learning_rate": 1.1793498836142685e-05, + "loss": 1.1763, + "step": 1782 + }, + { + "epoch": 1.3494796594134342, + "grad_norm": 2.263129711151123, + "learning_rate": 1.178560050625401e-05, + "loss": 1.1737, + "step": 1783 + }, + { + "epoch": 1.350236518448439, + "grad_norm": 2.3742623329162598, + "learning_rate": 1.1777701026039063e-05, + "loss": 1.123, + "step": 1784 + }, + { + "epoch": 1.3509933774834437, + "grad_norm": 2.1886773109436035, + "learning_rate": 1.1769800400589733e-05, + "loss": 1.1635, + "step": 1785 + }, + { + "epoch": 1.3517502365184484, + "grad_norm": 2.190129041671753, + "learning_rate": 1.1761898634998635e-05, + "loss": 1.1131, + "step": 1786 + }, + { + "epoch": 1.3525070955534533, + "grad_norm": 2.258070468902588, + "learning_rate": 1.1753995734359131e-05, + "loss": 1.1794, + "step": 1787 + }, + { + "epoch": 1.353263954588458, + "grad_norm": 2.0639896392822266, + "learning_rate": 1.1746091703765316e-05, + "loss": 1.1047, + "step": 1788 + }, + { + "epoch": 1.3540208136234626, + "grad_norm": 2.4623801708221436, + "learning_rate": 1.1738186548311998e-05, + "loss": 1.1642, + "step": 1789 + }, + { + "epoch": 1.3547776726584675, + "grad_norm": 2.082294225692749, + "learning_rate": 1.1730280273094724e-05, + "loss": 1.1374, + "step": 1790 + }, + { + "epoch": 1.3555345316934722, + "grad_norm": 1.9872076511383057, + "learning_rate": 1.1722372883209758e-05, + "loss": 1.1305, + "step": 1791 + }, + { + "epoch": 1.3562913907284768, + "grad_norm": 2.2372727394104004, + "learning_rate": 1.1714464383754085e-05, + "loss": 1.1261, + "step": 1792 + }, + { + "epoch": 1.3570482497634815, + "grad_norm": 2.16003680229187, + "learning_rate": 1.1706554779825399e-05, + "loss": 1.1289, + "step": 1793 + }, + { + "epoch": 1.3578051087984861, + "grad_norm": 2.2329182624816895, + "learning_rate": 1.1698644076522112e-05, + "loss": 1.1331, + "step": 1794 + }, + { + "epoch": 1.358561967833491, + "grad_norm": 2.2425284385681152, + "learning_rate": 1.1690732278943344e-05, + "loss": 1.2247, + "step": 1795 + }, + { + "epoch": 1.3593188268684957, + "grad_norm": 2.589672803878784, + "learning_rate": 1.1682819392188917e-05, + "loss": 1.144, + "step": 1796 + }, + { + "epoch": 1.3600756859035004, + "grad_norm": 2.2635231018066406, + "learning_rate": 1.1674905421359358e-05, + "loss": 1.1585, + "step": 1797 + }, + { + "epoch": 1.3608325449385053, + "grad_norm": 2.291184425354004, + "learning_rate": 1.1666990371555893e-05, + "loss": 1.1063, + "step": 1798 + }, + { + "epoch": 1.36158940397351, + "grad_norm": 2.289581298828125, + "learning_rate": 1.1659074247880442e-05, + "loss": 1.183, + "step": 1799 + }, + { + "epoch": 1.3623462630085146, + "grad_norm": 2.1125857830047607, + "learning_rate": 1.1651157055435616e-05, + "loss": 1.1226, + "step": 1800 + }, + { + "epoch": 1.3631031220435195, + "grad_norm": 2.1084022521972656, + "learning_rate": 1.1643238799324714e-05, + "loss": 1.1741, + "step": 1801 + }, + { + "epoch": 1.3638599810785241, + "grad_norm": 2.2463252544403076, + "learning_rate": 1.1635319484651733e-05, + "loss": 1.1459, + "step": 1802 + }, + { + "epoch": 1.3646168401135288, + "grad_norm": 2.2021613121032715, + "learning_rate": 1.1627399116521334e-05, + "loss": 1.1939, + "step": 1803 + }, + { + "epoch": 1.3653736991485337, + "grad_norm": 2.190654754638672, + "learning_rate": 1.1619477700038863e-05, + "loss": 1.0967, + "step": 1804 + }, + { + "epoch": 1.3661305581835383, + "grad_norm": 1.9912575483322144, + "learning_rate": 1.1611555240310356e-05, + "loss": 1.1268, + "step": 1805 + }, + { + "epoch": 1.366887417218543, + "grad_norm": 2.1702189445495605, + "learning_rate": 1.16036317424425e-05, + "loss": 1.1957, + "step": 1806 + }, + { + "epoch": 1.367644276253548, + "grad_norm": 2.0921695232391357, + "learning_rate": 1.1595707211542662e-05, + "loss": 1.1161, + "step": 1807 + }, + { + "epoch": 1.3684011352885526, + "grad_norm": 2.1319305896759033, + "learning_rate": 1.1587781652718877e-05, + "loss": 1.1411, + "step": 1808 + }, + { + "epoch": 1.3691579943235572, + "grad_norm": 2.2225658893585205, + "learning_rate": 1.1579855071079838e-05, + "loss": 1.1259, + "step": 1809 + }, + { + "epoch": 1.369914853358562, + "grad_norm": 1.943051815032959, + "learning_rate": 1.1571927471734894e-05, + "loss": 1.1088, + "step": 1810 + }, + { + "epoch": 1.3706717123935668, + "grad_norm": 2.3888943195343018, + "learning_rate": 1.156399885979406e-05, + "loss": 1.1416, + "step": 1811 + }, + { + "epoch": 1.3714285714285714, + "grad_norm": 2.145301580429077, + "learning_rate": 1.1556069240368002e-05, + "loss": 1.1412, + "step": 1812 + }, + { + "epoch": 1.372185430463576, + "grad_norm": 2.0973587036132812, + "learning_rate": 1.1548138618568023e-05, + "loss": 1.1358, + "step": 1813 + }, + { + "epoch": 1.3729422894985808, + "grad_norm": 2.2998507022857666, + "learning_rate": 1.1540206999506086e-05, + "loss": 1.152, + "step": 1814 + }, + { + "epoch": 1.3736991485335857, + "grad_norm": 2.1464428901672363, + "learning_rate": 1.1532274388294789e-05, + "loss": 1.105, + "step": 1815 + }, + { + "epoch": 1.3744560075685903, + "grad_norm": 2.0508806705474854, + "learning_rate": 1.152434079004738e-05, + "loss": 1.1425, + "step": 1816 + }, + { + "epoch": 1.375212866603595, + "grad_norm": 2.05656099319458, + "learning_rate": 1.151640620987772e-05, + "loss": 1.1042, + "step": 1817 + }, + { + "epoch": 1.3759697256385999, + "grad_norm": 2.153604030609131, + "learning_rate": 1.1508470652900332e-05, + "loss": 1.1361, + "step": 1818 + }, + { + "epoch": 1.3767265846736045, + "grad_norm": 2.9740560054779053, + "learning_rate": 1.1500534124230354e-05, + "loss": 1.1646, + "step": 1819 + }, + { + "epoch": 1.3774834437086092, + "grad_norm": 2.213672637939453, + "learning_rate": 1.149259662898354e-05, + "loss": 1.1348, + "step": 1820 + }, + { + "epoch": 1.378240302743614, + "grad_norm": 2.2703373432159424, + "learning_rate": 1.148465817227629e-05, + "loss": 1.1456, + "step": 1821 + }, + { + "epoch": 1.3789971617786188, + "grad_norm": 2.1815407276153564, + "learning_rate": 1.1476718759225611e-05, + "loss": 1.16, + "step": 1822 + }, + { + "epoch": 1.3797540208136234, + "grad_norm": 2.198272943496704, + "learning_rate": 1.1468778394949123e-05, + "loss": 1.1677, + "step": 1823 + }, + { + "epoch": 1.3805108798486283, + "grad_norm": 2.1629281044006348, + "learning_rate": 1.1460837084565064e-05, + "loss": 1.1269, + "step": 1824 + }, + { + "epoch": 1.381267738883633, + "grad_norm": 1.9979993104934692, + "learning_rate": 1.1452894833192287e-05, + "loss": 1.1243, + "step": 1825 + }, + { + "epoch": 1.3820245979186376, + "grad_norm": 2.1406540870666504, + "learning_rate": 1.144495164595024e-05, + "loss": 1.1819, + "step": 1826 + }, + { + "epoch": 1.3827814569536425, + "grad_norm": 2.2074644565582275, + "learning_rate": 1.1437007527958985e-05, + "loss": 1.1368, + "step": 1827 + }, + { + "epoch": 1.3835383159886472, + "grad_norm": 2.279019355773926, + "learning_rate": 1.1429062484339175e-05, + "loss": 1.1293, + "step": 1828 + }, + { + "epoch": 1.3842951750236518, + "grad_norm": 2.179516315460205, + "learning_rate": 1.1421116520212066e-05, + "loss": 1.1538, + "step": 1829 + }, + { + "epoch": 1.3850520340586565, + "grad_norm": 2.0977933406829834, + "learning_rate": 1.1413169640699505e-05, + "loss": 1.1259, + "step": 1830 + }, + { + "epoch": 1.3858088930936612, + "grad_norm": 2.1527068614959717, + "learning_rate": 1.1405221850923932e-05, + "loss": 1.0934, + "step": 1831 + }, + { + "epoch": 1.386565752128666, + "grad_norm": 2.2525691986083984, + "learning_rate": 1.1397273156008364e-05, + "loss": 1.2084, + "step": 1832 + }, + { + "epoch": 1.3873226111636707, + "grad_norm": 2.0335781574249268, + "learning_rate": 1.1389323561076419e-05, + "loss": 1.1224, + "step": 1833 + }, + { + "epoch": 1.3880794701986754, + "grad_norm": 2.142940044403076, + "learning_rate": 1.1381373071252273e-05, + "loss": 1.0934, + "step": 1834 + }, + { + "epoch": 1.3888363292336803, + "grad_norm": 2.2513163089752197, + "learning_rate": 1.1373421691660697e-05, + "loss": 1.161, + "step": 1835 + }, + { + "epoch": 1.389593188268685, + "grad_norm": 2.1784231662750244, + "learning_rate": 1.1365469427427037e-05, + "loss": 1.127, + "step": 1836 + }, + { + "epoch": 1.3903500473036896, + "grad_norm": 2.2235348224639893, + "learning_rate": 1.1357516283677185e-05, + "loss": 1.1595, + "step": 1837 + }, + { + "epoch": 1.3911069063386945, + "grad_norm": 2.08614182472229, + "learning_rate": 1.1349562265537626e-05, + "loss": 1.1083, + "step": 1838 + }, + { + "epoch": 1.3918637653736992, + "grad_norm": 1.9127520322799683, + "learning_rate": 1.1341607378135395e-05, + "loss": 1.1516, + "step": 1839 + }, + { + "epoch": 1.3926206244087038, + "grad_norm": 2.100748300552368, + "learning_rate": 1.1333651626598095e-05, + "loss": 1.1306, + "step": 1840 + }, + { + "epoch": 1.3933774834437087, + "grad_norm": 2.3174188137054443, + "learning_rate": 1.1325695016053878e-05, + "loss": 1.1893, + "step": 1841 + }, + { + "epoch": 1.3941343424787134, + "grad_norm": 2.4146411418914795, + "learning_rate": 1.1317737551631455e-05, + "loss": 1.1463, + "step": 1842 + }, + { + "epoch": 1.394891201513718, + "grad_norm": 2.604128837585449, + "learning_rate": 1.130977923846009e-05, + "loss": 1.131, + "step": 1843 + }, + { + "epoch": 1.395648060548723, + "grad_norm": 2.1692941188812256, + "learning_rate": 1.1301820081669586e-05, + "loss": 1.1504, + "step": 1844 + }, + { + "epoch": 1.3964049195837276, + "grad_norm": 1.9960031509399414, + "learning_rate": 1.1293860086390294e-05, + "loss": 1.133, + "step": 1845 + }, + { + "epoch": 1.3971617786187323, + "grad_norm": 2.2130203247070312, + "learning_rate": 1.1285899257753105e-05, + "loss": 1.1375, + "step": 1846 + }, + { + "epoch": 1.397918637653737, + "grad_norm": 2.1830358505249023, + "learning_rate": 1.1277937600889458e-05, + "loss": 1.1391, + "step": 1847 + }, + { + "epoch": 1.3986754966887418, + "grad_norm": 2.188948392868042, + "learning_rate": 1.1269975120931301e-05, + "loss": 1.1241, + "step": 1848 + }, + { + "epoch": 1.3994323557237465, + "grad_norm": 2.3037242889404297, + "learning_rate": 1.1262011823011132e-05, + "loss": 1.1221, + "step": 1849 + }, + { + "epoch": 1.4001892147587511, + "grad_norm": 2.0598981380462646, + "learning_rate": 1.1254047712261975e-05, + "loss": 1.108, + "step": 1850 + }, + { + "epoch": 1.4009460737937558, + "grad_norm": 2.116628885269165, + "learning_rate": 1.1246082793817372e-05, + "loss": 1.1166, + "step": 1851 + }, + { + "epoch": 1.4017029328287607, + "grad_norm": 2.09624981880188, + "learning_rate": 1.1238117072811389e-05, + "loss": 1.1216, + "step": 1852 + }, + { + "epoch": 1.4024597918637653, + "grad_norm": 2.092494487762451, + "learning_rate": 1.1230150554378606e-05, + "loss": 1.1447, + "step": 1853 + }, + { + "epoch": 1.40321665089877, + "grad_norm": 1.9815505743026733, + "learning_rate": 1.1222183243654119e-05, + "loss": 1.1939, + "step": 1854 + }, + { + "epoch": 1.403973509933775, + "grad_norm": 2.285538673400879, + "learning_rate": 1.121421514577354e-05, + "loss": 1.1981, + "step": 1855 + }, + { + "epoch": 1.4047303689687796, + "grad_norm": 2.4579432010650635, + "learning_rate": 1.1206246265872975e-05, + "loss": 1.1246, + "step": 1856 + }, + { + "epoch": 1.4054872280037842, + "grad_norm": 2.2195796966552734, + "learning_rate": 1.1198276609089051e-05, + "loss": 1.0943, + "step": 1857 + }, + { + "epoch": 1.4062440870387891, + "grad_norm": 2.3332061767578125, + "learning_rate": 1.1190306180558886e-05, + "loss": 1.1896, + "step": 1858 + }, + { + "epoch": 1.4070009460737938, + "grad_norm": 2.257955551147461, + "learning_rate": 1.1182334985420088e-05, + "loss": 1.1565, + "step": 1859 + }, + { + "epoch": 1.4077578051087984, + "grad_norm": 2.1527364253997803, + "learning_rate": 1.1174363028810782e-05, + "loss": 1.1269, + "step": 1860 + }, + { + "epoch": 1.4085146641438033, + "grad_norm": 2.168989896774292, + "learning_rate": 1.1166390315869555e-05, + "loss": 1.118, + "step": 1861 + }, + { + "epoch": 1.409271523178808, + "grad_norm": 2.1610758304595947, + "learning_rate": 1.1158416851735505e-05, + "loss": 1.1126, + "step": 1862 + }, + { + "epoch": 1.4100283822138127, + "grad_norm": 2.423572063446045, + "learning_rate": 1.1150442641548205e-05, + "loss": 1.1681, + "step": 1863 + }, + { + "epoch": 1.4107852412488175, + "grad_norm": 2.1142797470092773, + "learning_rate": 1.1142467690447708e-05, + "loss": 1.1159, + "step": 1864 + }, + { + "epoch": 1.4115421002838222, + "grad_norm": 2.206160068511963, + "learning_rate": 1.1134492003574541e-05, + "loss": 1.1007, + "step": 1865 + }, + { + "epoch": 1.4122989593188269, + "grad_norm": 2.223226547241211, + "learning_rate": 1.1126515586069716e-05, + "loss": 1.0648, + "step": 1866 + }, + { + "epoch": 1.4130558183538315, + "grad_norm": 2.488703966140747, + "learning_rate": 1.1118538443074713e-05, + "loss": 1.107, + "step": 1867 + }, + { + "epoch": 1.4138126773888362, + "grad_norm": 2.2958545684814453, + "learning_rate": 1.1110560579731469e-05, + "loss": 1.1553, + "step": 1868 + }, + { + "epoch": 1.414569536423841, + "grad_norm": 2.343440055847168, + "learning_rate": 1.1102582001182399e-05, + "loss": 1.1225, + "step": 1869 + }, + { + "epoch": 1.4153263954588458, + "grad_norm": 2.963460922241211, + "learning_rate": 1.1094602712570366e-05, + "loss": 1.1211, + "step": 1870 + }, + { + "epoch": 1.4160832544938504, + "grad_norm": 2.123777151107788, + "learning_rate": 1.1086622719038708e-05, + "loss": 1.0919, + "step": 1871 + }, + { + "epoch": 1.4168401135288553, + "grad_norm": 2.1496341228485107, + "learning_rate": 1.1078642025731197e-05, + "loss": 1.0807, + "step": 1872 + }, + { + "epoch": 1.41759697256386, + "grad_norm": 2.147340774536133, + "learning_rate": 1.107066063779207e-05, + "loss": 1.1372, + "step": 1873 + }, + { + "epoch": 1.4183538315988646, + "grad_norm": 2.05765438079834, + "learning_rate": 1.1062678560366013e-05, + "loss": 1.1531, + "step": 1874 + }, + { + "epoch": 1.4191106906338695, + "grad_norm": 2.409080982208252, + "learning_rate": 1.1054695798598142e-05, + "loss": 1.1531, + "step": 1875 + }, + { + "epoch": 1.4198675496688742, + "grad_norm": 1.9932847023010254, + "learning_rate": 1.104671235763403e-05, + "loss": 1.1766, + "step": 1876 + }, + { + "epoch": 1.4206244087038788, + "grad_norm": 2.2019896507263184, + "learning_rate": 1.1038728242619686e-05, + "loss": 1.1037, + "step": 1877 + }, + { + "epoch": 1.4213812677388837, + "grad_norm": 2.263040065765381, + "learning_rate": 1.1030743458701533e-05, + "loss": 1.1475, + "step": 1878 + }, + { + "epoch": 1.4221381267738884, + "grad_norm": 2.0611464977264404, + "learning_rate": 1.1022758011026455e-05, + "loss": 1.1652, + "step": 1879 + }, + { + "epoch": 1.422894985808893, + "grad_norm": 2.175058364868164, + "learning_rate": 1.1014771904741746e-05, + "loss": 1.1164, + "step": 1880 + }, + { + "epoch": 1.423651844843898, + "grad_norm": 2.152921676635742, + "learning_rate": 1.1006785144995127e-05, + "loss": 1.1327, + "step": 1881 + }, + { + "epoch": 1.4244087038789026, + "grad_norm": 2.4787025451660156, + "learning_rate": 1.0998797736934743e-05, + "loss": 1.1633, + "step": 1882 + }, + { + "epoch": 1.4251655629139073, + "grad_norm": 2.7934088706970215, + "learning_rate": 1.0990809685709149e-05, + "loss": 1.1831, + "step": 1883 + }, + { + "epoch": 1.425922421948912, + "grad_norm": 2.058727502822876, + "learning_rate": 1.0982820996467334e-05, + "loss": 1.1262, + "step": 1884 + }, + { + "epoch": 1.4266792809839166, + "grad_norm": 2.1673519611358643, + "learning_rate": 1.0974831674358674e-05, + "loss": 1.1478, + "step": 1885 + }, + { + "epoch": 1.4274361400189215, + "grad_norm": 2.481576442718506, + "learning_rate": 1.0966841724532966e-05, + "loss": 1.1166, + "step": 1886 + }, + { + "epoch": 1.4281929990539262, + "grad_norm": 2.131117820739746, + "learning_rate": 1.0958851152140413e-05, + "loss": 1.084, + "step": 1887 + }, + { + "epoch": 1.4289498580889308, + "grad_norm": 2.3017077445983887, + "learning_rate": 1.095085996233162e-05, + "loss": 1.1406, + "step": 1888 + }, + { + "epoch": 1.4297067171239357, + "grad_norm": 2.4619855880737305, + "learning_rate": 1.0942868160257574e-05, + "loss": 1.1287, + "step": 1889 + }, + { + "epoch": 1.4304635761589404, + "grad_norm": 2.3940885066986084, + "learning_rate": 1.0934875751069679e-05, + "loss": 1.1135, + "step": 1890 + }, + { + "epoch": 1.431220435193945, + "grad_norm": 2.3564260005950928, + "learning_rate": 1.0926882739919718e-05, + "loss": 1.1474, + "step": 1891 + }, + { + "epoch": 1.43197729422895, + "grad_norm": 2.1630711555480957, + "learning_rate": 1.091888913195986e-05, + "loss": 1.1622, + "step": 1892 + }, + { + "epoch": 1.4327341532639546, + "grad_norm": 2.173370838165283, + "learning_rate": 1.0910894932342666e-05, + "loss": 1.1384, + "step": 1893 + }, + { + "epoch": 1.4334910122989593, + "grad_norm": 2.09320068359375, + "learning_rate": 1.0902900146221075e-05, + "loss": 1.1625, + "step": 1894 + }, + { + "epoch": 1.4342478713339641, + "grad_norm": 2.508751630783081, + "learning_rate": 1.0894904778748406e-05, + "loss": 1.1457, + "step": 1895 + }, + { + "epoch": 1.4350047303689688, + "grad_norm": 2.234450578689575, + "learning_rate": 1.0886908835078349e-05, + "loss": 1.1785, + "step": 1896 + }, + { + "epoch": 1.4357615894039735, + "grad_norm": 2.431640148162842, + "learning_rate": 1.0878912320364962e-05, + "loss": 1.1465, + "step": 1897 + }, + { + "epoch": 1.4365184484389784, + "grad_norm": 2.068406581878662, + "learning_rate": 1.087091523976269e-05, + "loss": 1.1386, + "step": 1898 + }, + { + "epoch": 1.437275307473983, + "grad_norm": 2.1216137409210205, + "learning_rate": 1.0862917598426315e-05, + "loss": 1.1177, + "step": 1899 + }, + { + "epoch": 1.4380321665089877, + "grad_norm": 2.36860990524292, + "learning_rate": 1.0854919401511002e-05, + "loss": 1.1612, + "step": 1900 + }, + { + "epoch": 1.4387890255439924, + "grad_norm": 2.2473835945129395, + "learning_rate": 1.0846920654172264e-05, + "loss": 1.1123, + "step": 1901 + }, + { + "epoch": 1.4395458845789972, + "grad_norm": 2.1833431720733643, + "learning_rate": 1.0838921361565978e-05, + "loss": 1.133, + "step": 1902 + }, + { + "epoch": 1.440302743614002, + "grad_norm": 2.078742742538452, + "learning_rate": 1.0830921528848355e-05, + "loss": 1.1634, + "step": 1903 + }, + { + "epoch": 1.4410596026490066, + "grad_norm": 2.1691906452178955, + "learning_rate": 1.0822921161175974e-05, + "loss": 1.1557, + "step": 1904 + }, + { + "epoch": 1.4418164616840112, + "grad_norm": 2.200441598892212, + "learning_rate": 1.0814920263705746e-05, + "loss": 1.1438, + "step": 1905 + }, + { + "epoch": 1.4425733207190161, + "grad_norm": 2.1800084114074707, + "learning_rate": 1.0806918841594929e-05, + "loss": 1.1395, + "step": 1906 + }, + { + "epoch": 1.4433301797540208, + "grad_norm": 2.34407901763916, + "learning_rate": 1.0798916900001117e-05, + "loss": 1.1448, + "step": 1907 + }, + { + "epoch": 1.4440870387890254, + "grad_norm": 2.1149091720581055, + "learning_rate": 1.0790914444082244e-05, + "loss": 1.1664, + "step": 1908 + }, + { + "epoch": 1.4448438978240303, + "grad_norm": 2.3421874046325684, + "learning_rate": 1.0782911478996559e-05, + "loss": 1.1109, + "step": 1909 + }, + { + "epoch": 1.445600756859035, + "grad_norm": 2.139888286590576, + "learning_rate": 1.0774908009902663e-05, + "loss": 1.1852, + "step": 1910 + }, + { + "epoch": 1.4463576158940397, + "grad_norm": 2.1743266582489014, + "learning_rate": 1.0766904041959465e-05, + "loss": 1.0994, + "step": 1911 + }, + { + "epoch": 1.4471144749290445, + "grad_norm": 2.0239481925964355, + "learning_rate": 1.0758899580326203e-05, + "loss": 1.0712, + "step": 1912 + }, + { + "epoch": 1.4478713339640492, + "grad_norm": 2.5557572841644287, + "learning_rate": 1.0750894630162429e-05, + "loss": 1.0855, + "step": 1913 + }, + { + "epoch": 1.4486281929990539, + "grad_norm": 2.1770548820495605, + "learning_rate": 1.0742889196628014e-05, + "loss": 1.1541, + "step": 1914 + }, + { + "epoch": 1.4493850520340588, + "grad_norm": 2.065044641494751, + "learning_rate": 1.073488328488314e-05, + "loss": 1.0722, + "step": 1915 + }, + { + "epoch": 1.4501419110690634, + "grad_norm": 2.274731159210205, + "learning_rate": 1.0726876900088287e-05, + "loss": 1.1562, + "step": 1916 + }, + { + "epoch": 1.450898770104068, + "grad_norm": 2.2915658950805664, + "learning_rate": 1.0718870047404253e-05, + "loss": 1.1573, + "step": 1917 + }, + { + "epoch": 1.451655629139073, + "grad_norm": 2.391997814178467, + "learning_rate": 1.0710862731992138e-05, + "loss": 1.1434, + "step": 1918 + }, + { + "epoch": 1.4524124881740776, + "grad_norm": 2.3179776668548584, + "learning_rate": 1.0702854959013332e-05, + "loss": 1.1352, + "step": 1919 + }, + { + "epoch": 1.4531693472090823, + "grad_norm": 2.223360776901245, + "learning_rate": 1.0694846733629519e-05, + "loss": 1.1152, + "step": 1920 + }, + { + "epoch": 1.453926206244087, + "grad_norm": 2.222038984298706, + "learning_rate": 1.0686838061002684e-05, + "loss": 1.0796, + "step": 1921 + }, + { + "epoch": 1.4546830652790916, + "grad_norm": 2.1372921466827393, + "learning_rate": 1.0678828946295099e-05, + "loss": 1.1047, + "step": 1922 + }, + { + "epoch": 1.4554399243140965, + "grad_norm": 2.1707942485809326, + "learning_rate": 1.0670819394669308e-05, + "loss": 1.1509, + "step": 1923 + }, + { + "epoch": 1.4561967833491012, + "grad_norm": 2.112736463546753, + "learning_rate": 1.066280941128815e-05, + "loss": 1.1266, + "step": 1924 + }, + { + "epoch": 1.4569536423841059, + "grad_norm": 2.3490540981292725, + "learning_rate": 1.065479900131474e-05, + "loss": 1.1621, + "step": 1925 + }, + { + "epoch": 1.4577105014191107, + "grad_norm": 2.17901873588562, + "learning_rate": 1.0646788169912465e-05, + "loss": 1.0735, + "step": 1926 + }, + { + "epoch": 1.4584673604541154, + "grad_norm": 2.0860230922698975, + "learning_rate": 1.0638776922244982e-05, + "loss": 1.1362, + "step": 1927 + }, + { + "epoch": 1.45922421948912, + "grad_norm": 2.0391974449157715, + "learning_rate": 1.0630765263476221e-05, + "loss": 1.1316, + "step": 1928 + }, + { + "epoch": 1.459981078524125, + "grad_norm": 2.0687365531921387, + "learning_rate": 1.062275319877038e-05, + "loss": 1.128, + "step": 1929 + }, + { + "epoch": 1.4607379375591296, + "grad_norm": 2.0217580795288086, + "learning_rate": 1.0614740733291902e-05, + "loss": 1.1377, + "step": 1930 + }, + { + "epoch": 1.4614947965941343, + "grad_norm": 2.0296125411987305, + "learning_rate": 1.060672787220551e-05, + "loss": 1.1236, + "step": 1931 + }, + { + "epoch": 1.4622516556291392, + "grad_norm": 2.0273191928863525, + "learning_rate": 1.0598714620676171e-05, + "loss": 1.1271, + "step": 1932 + }, + { + "epoch": 1.4630085146641438, + "grad_norm": 2.011613130569458, + "learning_rate": 1.05907009838691e-05, + "loss": 1.1457, + "step": 1933 + }, + { + "epoch": 1.4637653736991485, + "grad_norm": 2.0048105716705322, + "learning_rate": 1.058268696694977e-05, + "loss": 1.1499, + "step": 1934 + }, + { + "epoch": 1.4645222327341534, + "grad_norm": 2.086610794067383, + "learning_rate": 1.0574672575083891e-05, + "loss": 1.1376, + "step": 1935 + }, + { + "epoch": 1.465279091769158, + "grad_norm": 2.2125232219696045, + "learning_rate": 1.0566657813437419e-05, + "loss": 1.1103, + "step": 1936 + }, + { + "epoch": 1.4660359508041627, + "grad_norm": 2.172860622406006, + "learning_rate": 1.0558642687176548e-05, + "loss": 1.1306, + "step": 1937 + }, + { + "epoch": 1.4667928098391674, + "grad_norm": 2.1361825466156006, + "learning_rate": 1.0550627201467702e-05, + "loss": 1.0978, + "step": 1938 + }, + { + "epoch": 1.4675496688741723, + "grad_norm": 2.0148260593414307, + "learning_rate": 1.0542611361477548e-05, + "loss": 1.0851, + "step": 1939 + }, + { + "epoch": 1.468306527909177, + "grad_norm": 2.103895664215088, + "learning_rate": 1.0534595172372967e-05, + "loss": 1.1197, + "step": 1940 + }, + { + "epoch": 1.4690633869441816, + "grad_norm": 2.1808462142944336, + "learning_rate": 1.0526578639321078e-05, + "loss": 1.1192, + "step": 1941 + }, + { + "epoch": 1.4698202459791863, + "grad_norm": 2.2360849380493164, + "learning_rate": 1.0518561767489211e-05, + "loss": 1.1142, + "step": 1942 + }, + { + "epoch": 1.4705771050141911, + "grad_norm": 2.243360996246338, + "learning_rate": 1.0510544562044925e-05, + "loss": 1.2133, + "step": 1943 + }, + { + "epoch": 1.4713339640491958, + "grad_norm": 2.07759690284729, + "learning_rate": 1.050252702815598e-05, + "loss": 1.1227, + "step": 1944 + }, + { + "epoch": 1.4720908230842005, + "grad_norm": 2.0380797386169434, + "learning_rate": 1.0494509170990362e-05, + "loss": 1.0894, + "step": 1945 + }, + { + "epoch": 1.4728476821192054, + "grad_norm": 2.184549570083618, + "learning_rate": 1.0486490995716264e-05, + "loss": 1.1314, + "step": 1946 + }, + { + "epoch": 1.47360454115421, + "grad_norm": 2.1510207653045654, + "learning_rate": 1.0478472507502069e-05, + "loss": 1.1688, + "step": 1947 + }, + { + "epoch": 1.4743614001892147, + "grad_norm": 2.1699905395507812, + "learning_rate": 1.0470453711516377e-05, + "loss": 1.1374, + "step": 1948 + }, + { + "epoch": 1.4751182592242196, + "grad_norm": 2.0163750648498535, + "learning_rate": 1.0462434612927984e-05, + "loss": 1.1469, + "step": 1949 + }, + { + "epoch": 1.4758751182592242, + "grad_norm": 2.176668882369995, + "learning_rate": 1.0454415216905875e-05, + "loss": 1.154, + "step": 1950 + }, + { + "epoch": 1.476631977294229, + "grad_norm": 2.18507981300354, + "learning_rate": 1.0446395528619236e-05, + "loss": 1.1175, + "step": 1951 + }, + { + "epoch": 1.4773888363292338, + "grad_norm": 2.033001661300659, + "learning_rate": 1.0438375553237428e-05, + "loss": 1.1129, + "step": 1952 + }, + { + "epoch": 1.4781456953642385, + "grad_norm": 2.0419886112213135, + "learning_rate": 1.0430355295930008e-05, + "loss": 1.1455, + "step": 1953 + }, + { + "epoch": 1.4789025543992431, + "grad_norm": 2.083308696746826, + "learning_rate": 1.0422334761866715e-05, + "loss": 1.1069, + "step": 1954 + }, + { + "epoch": 1.479659413434248, + "grad_norm": 2.0463309288024902, + "learning_rate": 1.0414313956217456e-05, + "loss": 1.1456, + "step": 1955 + }, + { + "epoch": 1.4804162724692527, + "grad_norm": 2.0065863132476807, + "learning_rate": 1.0406292884152327e-05, + "loss": 1.0829, + "step": 1956 + }, + { + "epoch": 1.4811731315042573, + "grad_norm": 1.8798035383224487, + "learning_rate": 1.0398271550841586e-05, + "loss": 1.1378, + "step": 1957 + }, + { + "epoch": 1.481929990539262, + "grad_norm": 2.483062267303467, + "learning_rate": 1.0390249961455658e-05, + "loss": 1.0775, + "step": 1958 + }, + { + "epoch": 1.4826868495742667, + "grad_norm": 1.995613694190979, + "learning_rate": 1.0382228121165146e-05, + "loss": 1.1936, + "step": 1959 + }, + { + "epoch": 1.4834437086092715, + "grad_norm": 2.1545281410217285, + "learning_rate": 1.03742060351408e-05, + "loss": 1.1802, + "step": 1960 + }, + { + "epoch": 1.4842005676442762, + "grad_norm": 2.1138501167297363, + "learning_rate": 1.0366183708553532e-05, + "loss": 1.102, + "step": 1961 + }, + { + "epoch": 1.4849574266792809, + "grad_norm": 2.1736159324645996, + "learning_rate": 1.0358161146574417e-05, + "loss": 1.1844, + "step": 1962 + }, + { + "epoch": 1.4857142857142858, + "grad_norm": 2.0476620197296143, + "learning_rate": 1.0350138354374675e-05, + "loss": 1.1117, + "step": 1963 + }, + { + "epoch": 1.4864711447492904, + "grad_norm": 2.070690631866455, + "learning_rate": 1.034211533712567e-05, + "loss": 1.0858, + "step": 1964 + }, + { + "epoch": 1.487228003784295, + "grad_norm": 2.256793975830078, + "learning_rate": 1.0334092099998926e-05, + "loss": 1.1564, + "step": 1965 + }, + { + "epoch": 1.4879848628193, + "grad_norm": 1.8769042491912842, + "learning_rate": 1.0326068648166088e-05, + "loss": 1.1211, + "step": 1966 + }, + { + "epoch": 1.4887417218543046, + "grad_norm": 2.271409749984741, + "learning_rate": 1.0318044986798961e-05, + "loss": 1.1329, + "step": 1967 + }, + { + "epoch": 1.4894985808893093, + "grad_norm": 2.035731315612793, + "learning_rate": 1.031002112106947e-05, + "loss": 1.0566, + "step": 1968 + }, + { + "epoch": 1.4902554399243142, + "grad_norm": 1.9846116304397583, + "learning_rate": 1.0301997056149678e-05, + "loss": 1.1373, + "step": 1969 + }, + { + "epoch": 1.4910122989593189, + "grad_norm": 2.304295301437378, + "learning_rate": 1.0293972797211774e-05, + "loss": 1.1098, + "step": 1970 + }, + { + "epoch": 1.4917691579943235, + "grad_norm": 2.189412832260132, + "learning_rate": 1.028594834942807e-05, + "loss": 1.1026, + "step": 1971 + }, + { + "epoch": 1.4925260170293284, + "grad_norm": 2.1527864933013916, + "learning_rate": 1.0277923717971006e-05, + "loss": 1.1262, + "step": 1972 + }, + { + "epoch": 1.493282876064333, + "grad_norm": 2.20159912109375, + "learning_rate": 1.026989890801314e-05, + "loss": 1.092, + "step": 1973 + }, + { + "epoch": 1.4940397350993377, + "grad_norm": 2.2014966011047363, + "learning_rate": 1.0261873924727138e-05, + "loss": 1.1267, + "step": 1974 + }, + { + "epoch": 1.4947965941343424, + "grad_norm": 2.194817304611206, + "learning_rate": 1.0253848773285778e-05, + "loss": 1.1565, + "step": 1975 + }, + { + "epoch": 1.4955534531693473, + "grad_norm": 2.061915636062622, + "learning_rate": 1.0245823458861958e-05, + "loss": 1.1291, + "step": 1976 + }, + { + "epoch": 1.496310312204352, + "grad_norm": 2.1642725467681885, + "learning_rate": 1.0237797986628672e-05, + "loss": 1.1161, + "step": 1977 + }, + { + "epoch": 1.4970671712393566, + "grad_norm": 2.0526773929595947, + "learning_rate": 1.022977236175901e-05, + "loss": 1.1583, + "step": 1978 + }, + { + "epoch": 1.4978240302743613, + "grad_norm": 2.0349247455596924, + "learning_rate": 1.0221746589426176e-05, + "loss": 1.1161, + "step": 1979 + }, + { + "epoch": 1.4985808893093662, + "grad_norm": 2.1763689517974854, + "learning_rate": 1.0213720674803458e-05, + "loss": 1.1344, + "step": 1980 + }, + { + "epoch": 1.4993377483443708, + "grad_norm": 2.139963150024414, + "learning_rate": 1.0205694623064236e-05, + "loss": 1.2086, + "step": 1981 + }, + { + "epoch": 1.5000946073793755, + "grad_norm": 2.633737564086914, + "learning_rate": 1.0197668439381978e-05, + "loss": 1.1523, + "step": 1982 + }, + { + "epoch": 1.5008514664143804, + "grad_norm": 2.0594277381896973, + "learning_rate": 1.0189642128930246e-05, + "loss": 1.1436, + "step": 1983 + }, + { + "epoch": 1.501608325449385, + "grad_norm": 2.1511809825897217, + "learning_rate": 1.0181615696882676e-05, + "loss": 1.1195, + "step": 1984 + }, + { + "epoch": 1.5023651844843897, + "grad_norm": 1.992146372795105, + "learning_rate": 1.0173589148412981e-05, + "loss": 1.1534, + "step": 1985 + }, + { + "epoch": 1.5031220435193946, + "grad_norm": 2.000650644302368, + "learning_rate": 1.0165562488694953e-05, + "loss": 1.1158, + "step": 1986 + }, + { + "epoch": 1.5038789025543993, + "grad_norm": 2.0944910049438477, + "learning_rate": 1.0157535722902456e-05, + "loss": 1.0991, + "step": 1987 + }, + { + "epoch": 1.504635761589404, + "grad_norm": 2.3380539417266846, + "learning_rate": 1.0149508856209416e-05, + "loss": 1.148, + "step": 1988 + }, + { + "epoch": 1.5053926206244088, + "grad_norm": 2.1337814331054688, + "learning_rate": 1.014148189378983e-05, + "loss": 1.1508, + "step": 1989 + }, + { + "epoch": 1.5061494796594135, + "grad_norm": 2.069946765899658, + "learning_rate": 1.0133454840817765e-05, + "loss": 1.1449, + "step": 1990 + }, + { + "epoch": 1.5069063386944181, + "grad_norm": 2.4194324016571045, + "learning_rate": 1.0125427702467327e-05, + "loss": 1.1833, + "step": 1991 + }, + { + "epoch": 1.507663197729423, + "grad_norm": 2.0037777423858643, + "learning_rate": 1.0117400483912687e-05, + "loss": 1.1053, + "step": 1992 + }, + { + "epoch": 1.5084200567644275, + "grad_norm": 1.9638372659683228, + "learning_rate": 1.010937319032807e-05, + "loss": 1.1249, + "step": 1993 + }, + { + "epoch": 1.5091769157994324, + "grad_norm": 2.185102939605713, + "learning_rate": 1.0101345826887752e-05, + "loss": 1.1369, + "step": 1994 + }, + { + "epoch": 1.5099337748344372, + "grad_norm": 2.193578004837036, + "learning_rate": 1.0093318398766042e-05, + "loss": 1.1268, + "step": 1995 + }, + { + "epoch": 1.5106906338694417, + "grad_norm": 2.1746068000793457, + "learning_rate": 1.0085290911137298e-05, + "loss": 1.1316, + "step": 1996 + }, + { + "epoch": 1.5114474929044466, + "grad_norm": 2.308969736099243, + "learning_rate": 1.0077263369175918e-05, + "loss": 1.11, + "step": 1997 + }, + { + "epoch": 1.5122043519394512, + "grad_norm": 2.2050511837005615, + "learning_rate": 1.0069235778056336e-05, + "loss": 1.1363, + "step": 1998 + }, + { + "epoch": 1.512961210974456, + "grad_norm": 2.351792812347412, + "learning_rate": 1.0061208142953012e-05, + "loss": 1.1222, + "step": 1999 + }, + { + "epoch": 1.5137180700094608, + "grad_norm": 2.144644021987915, + "learning_rate": 1.0053180469040433e-05, + "loss": 1.0997, + "step": 2000 + }, + { + "epoch": 1.5144749290444655, + "grad_norm": 2.1637988090515137, + "learning_rate": 1.0045152761493127e-05, + "loss": 1.0968, + "step": 2001 + }, + { + "epoch": 1.5152317880794701, + "grad_norm": 2.200721502304077, + "learning_rate": 1.0037125025485616e-05, + "loss": 1.1016, + "step": 2002 + }, + { + "epoch": 1.515988647114475, + "grad_norm": 2.3035366535186768, + "learning_rate": 1.0029097266192467e-05, + "loss": 1.1659, + "step": 2003 + }, + { + "epoch": 1.5167455061494797, + "grad_norm": 2.34773588180542, + "learning_rate": 1.0021069488788253e-05, + "loss": 1.0888, + "step": 2004 + }, + { + "epoch": 1.5175023651844843, + "grad_norm": 2.268134117126465, + "learning_rate": 1.0013041698447547e-05, + "loss": 1.1519, + "step": 2005 + }, + { + "epoch": 1.5182592242194892, + "grad_norm": 2.331434726715088, + "learning_rate": 1.000501390034495e-05, + "loss": 1.1335, + "step": 2006 + }, + { + "epoch": 1.5190160832544939, + "grad_norm": 2.3400261402130127, + "learning_rate": 9.996986099655052e-06, + "loss": 1.1808, + "step": 2007 + }, + { + "epoch": 1.5197729422894986, + "grad_norm": 2.348576068878174, + "learning_rate": 9.988958301552454e-06, + "loss": 1.1358, + "step": 2008 + }, + { + "epoch": 1.5205298013245034, + "grad_norm": 2.131770610809326, + "learning_rate": 9.980930511211751e-06, + "loss": 1.0952, + "step": 2009 + }, + { + "epoch": 1.5212866603595079, + "grad_norm": 2.3337466716766357, + "learning_rate": 9.972902733807532e-06, + "loss": 1.1449, + "step": 2010 + }, + { + "epoch": 1.5220435193945128, + "grad_norm": 2.0936343669891357, + "learning_rate": 9.964874974514386e-06, + "loss": 1.1176, + "step": 2011 + }, + { + "epoch": 1.5228003784295177, + "grad_norm": 2.0963313579559326, + "learning_rate": 9.95684723850688e-06, + "loss": 1.1481, + "step": 2012 + }, + { + "epoch": 1.523557237464522, + "grad_norm": 2.055452823638916, + "learning_rate": 9.948819530959566e-06, + "loss": 1.1008, + "step": 2013 + }, + { + "epoch": 1.524314096499527, + "grad_norm": 2.257266044616699, + "learning_rate": 9.94079185704699e-06, + "loss": 1.1544, + "step": 2014 + }, + { + "epoch": 1.5250709555345316, + "grad_norm": 2.06075119972229, + "learning_rate": 9.932764221943666e-06, + "loss": 1.1153, + "step": 2015 + }, + { + "epoch": 1.5258278145695363, + "grad_norm": 2.2544174194335938, + "learning_rate": 9.924736630824083e-06, + "loss": 1.0718, + "step": 2016 + }, + { + "epoch": 1.5265846736045412, + "grad_norm": 2.1009559631347656, + "learning_rate": 9.916709088862707e-06, + "loss": 1.137, + "step": 2017 + }, + { + "epoch": 1.5273415326395459, + "grad_norm": 2.014848470687866, + "learning_rate": 9.908681601233964e-06, + "loss": 1.1268, + "step": 2018 + }, + { + "epoch": 1.5280983916745505, + "grad_norm": 2.2673892974853516, + "learning_rate": 9.900654173112251e-06, + "loss": 1.1528, + "step": 2019 + }, + { + "epoch": 1.5288552507095554, + "grad_norm": 2.206071138381958, + "learning_rate": 9.89262680967193e-06, + "loss": 1.1013, + "step": 2020 + }, + { + "epoch": 1.52961210974456, + "grad_norm": 2.102032423019409, + "learning_rate": 9.884599516087314e-06, + "loss": 1.105, + "step": 2021 + }, + { + "epoch": 1.5303689687795647, + "grad_norm": 2.1362051963806152, + "learning_rate": 9.876572297532677e-06, + "loss": 1.1479, + "step": 2022 + }, + { + "epoch": 1.5311258278145696, + "grad_norm": 2.0666024684906006, + "learning_rate": 9.868545159182238e-06, + "loss": 1.1257, + "step": 2023 + }, + { + "epoch": 1.5318826868495743, + "grad_norm": 1.9618515968322754, + "learning_rate": 9.860518106210167e-06, + "loss": 1.0649, + "step": 2024 + }, + { + "epoch": 1.532639545884579, + "grad_norm": 2.202753782272339, + "learning_rate": 9.852491143790587e-06, + "loss": 1.1016, + "step": 2025 + }, + { + "epoch": 1.5333964049195838, + "grad_norm": 1.9656624794006348, + "learning_rate": 9.844464277097549e-06, + "loss": 1.1435, + "step": 2026 + }, + { + "epoch": 1.5341532639545885, + "grad_norm": 2.06479811668396, + "learning_rate": 9.83643751130505e-06, + "loss": 1.1393, + "step": 2027 + }, + { + "epoch": 1.5349101229895932, + "grad_norm": 2.1722230911254883, + "learning_rate": 9.828410851587023e-06, + "loss": 1.1587, + "step": 2028 + }, + { + "epoch": 1.535666982024598, + "grad_norm": 2.090200901031494, + "learning_rate": 9.820384303117328e-06, + "loss": 1.109, + "step": 2029 + }, + { + "epoch": 1.5364238410596025, + "grad_norm": 1.9950278997421265, + "learning_rate": 9.812357871069754e-06, + "loss": 1.0648, + "step": 2030 + }, + { + "epoch": 1.5371807000946074, + "grad_norm": 2.141153573989868, + "learning_rate": 9.804331560618023e-06, + "loss": 1.1327, + "step": 2031 + }, + { + "epoch": 1.5379375591296123, + "grad_norm": 1.9659839868545532, + "learning_rate": 9.79630537693577e-06, + "loss": 1.1194, + "step": 2032 + }, + { + "epoch": 1.5386944181646167, + "grad_norm": 2.187727212905884, + "learning_rate": 9.788279325196547e-06, + "loss": 1.1225, + "step": 2033 + }, + { + "epoch": 1.5394512771996216, + "grad_norm": 2.4570298194885254, + "learning_rate": 9.780253410573827e-06, + "loss": 1.1252, + "step": 2034 + }, + { + "epoch": 1.5402081362346263, + "grad_norm": 2.2557146549224854, + "learning_rate": 9.772227638240993e-06, + "loss": 1.1698, + "step": 2035 + }, + { + "epoch": 1.540964995269631, + "grad_norm": 2.384152889251709, + "learning_rate": 9.764202013371333e-06, + "loss": 1.1447, + "step": 2036 + }, + { + "epoch": 1.5417218543046358, + "grad_norm": 2.0934481620788574, + "learning_rate": 9.756176541138045e-06, + "loss": 1.1429, + "step": 2037 + }, + { + "epoch": 1.5424787133396405, + "grad_norm": 2.418853282928467, + "learning_rate": 9.748151226714222e-06, + "loss": 1.1321, + "step": 2038 + }, + { + "epoch": 1.5432355723746451, + "grad_norm": 2.0321297645568848, + "learning_rate": 9.740126075272868e-06, + "loss": 1.0983, + "step": 2039 + }, + { + "epoch": 1.54399243140965, + "grad_norm": 2.088118076324463, + "learning_rate": 9.732101091986864e-06, + "loss": 1.1566, + "step": 2040 + }, + { + "epoch": 1.5447492904446547, + "grad_norm": 2.135477066040039, + "learning_rate": 9.724076282028993e-06, + "loss": 1.0886, + "step": 2041 + }, + { + "epoch": 1.5455061494796594, + "grad_norm": 2.1262335777282715, + "learning_rate": 9.716051650571933e-06, + "loss": 1.1461, + "step": 2042 + }, + { + "epoch": 1.5462630085146643, + "grad_norm": 2.2694787979125977, + "learning_rate": 9.708027202788229e-06, + "loss": 1.1725, + "step": 2043 + }, + { + "epoch": 1.547019867549669, + "grad_norm": 2.0884077548980713, + "learning_rate": 9.700002943850323e-06, + "loss": 1.1098, + "step": 2044 + }, + { + "epoch": 1.5477767265846736, + "grad_norm": 2.5522899627685547, + "learning_rate": 9.691978878930532e-06, + "loss": 1.0861, + "step": 2045 + }, + { + "epoch": 1.5485335856196785, + "grad_norm": 2.099339008331299, + "learning_rate": 9.68395501320104e-06, + "loss": 1.135, + "step": 2046 + }, + { + "epoch": 1.549290444654683, + "grad_norm": 2.0966038703918457, + "learning_rate": 9.675931351833911e-06, + "loss": 1.1468, + "step": 2047 + }, + { + "epoch": 1.5500473036896878, + "grad_norm": 1.972170352935791, + "learning_rate": 9.667907900001079e-06, + "loss": 1.0958, + "step": 2048 + }, + { + "epoch": 1.5508041627246927, + "grad_norm": 2.228671073913574, + "learning_rate": 9.659884662874332e-06, + "loss": 1.1338, + "step": 2049 + }, + { + "epoch": 1.5515610217596971, + "grad_norm": 1.9483565092086792, + "learning_rate": 9.65186164562533e-06, + "loss": 1.0757, + "step": 2050 + }, + { + "epoch": 1.552317880794702, + "grad_norm": 2.0136473178863525, + "learning_rate": 9.643838853425586e-06, + "loss": 1.1177, + "step": 2051 + }, + { + "epoch": 1.5530747398297067, + "grad_norm": 2.0868184566497803, + "learning_rate": 9.635816291446469e-06, + "loss": 1.1814, + "step": 2052 + }, + { + "epoch": 1.5538315988647113, + "grad_norm": 1.9814927577972412, + "learning_rate": 9.627793964859205e-06, + "loss": 1.1342, + "step": 2053 + }, + { + "epoch": 1.5545884578997162, + "grad_norm": 2.0822665691375732, + "learning_rate": 9.619771878834858e-06, + "loss": 1.0962, + "step": 2054 + }, + { + "epoch": 1.555345316934721, + "grad_norm": 2.0414929389953613, + "learning_rate": 9.611750038544343e-06, + "loss": 1.1178, + "step": 2055 + }, + { + "epoch": 1.5561021759697256, + "grad_norm": 2.134589672088623, + "learning_rate": 9.60372844915842e-06, + "loss": 1.133, + "step": 2056 + }, + { + "epoch": 1.5568590350047304, + "grad_norm": 2.3856427669525146, + "learning_rate": 9.595707115847676e-06, + "loss": 1.1252, + "step": 2057 + }, + { + "epoch": 1.557615894039735, + "grad_norm": 2.275172710418701, + "learning_rate": 9.587686043782545e-06, + "loss": 1.1195, + "step": 2058 + }, + { + "epoch": 1.5583727530747398, + "grad_norm": 2.2985713481903076, + "learning_rate": 9.579665238133291e-06, + "loss": 1.148, + "step": 2059 + }, + { + "epoch": 1.5591296121097447, + "grad_norm": 2.0514907836914062, + "learning_rate": 9.571644704069995e-06, + "loss": 1.1307, + "step": 2060 + }, + { + "epoch": 1.5598864711447493, + "grad_norm": 2.2182204723358154, + "learning_rate": 9.563624446762576e-06, + "loss": 1.0913, + "step": 2061 + }, + { + "epoch": 1.560643330179754, + "grad_norm": 2.628448486328125, + "learning_rate": 9.555604471380767e-06, + "loss": 1.1385, + "step": 2062 + }, + { + "epoch": 1.5614001892147589, + "grad_norm": 1.9690390825271606, + "learning_rate": 9.547584783094126e-06, + "loss": 1.1314, + "step": 2063 + }, + { + "epoch": 1.5621570482497635, + "grad_norm": 2.0425853729248047, + "learning_rate": 9.539565387072019e-06, + "loss": 1.1048, + "step": 2064 + }, + { + "epoch": 1.5629139072847682, + "grad_norm": 2.025308609008789, + "learning_rate": 9.531546288483624e-06, + "loss": 1.1012, + "step": 2065 + }, + { + "epoch": 1.563670766319773, + "grad_norm": 2.17830491065979, + "learning_rate": 9.523527492497934e-06, + "loss": 1.1926, + "step": 2066 + }, + { + "epoch": 1.5644276253547775, + "grad_norm": 3.244462251663208, + "learning_rate": 9.51550900428374e-06, + "loss": 1.0953, + "step": 2067 + }, + { + "epoch": 1.5651844843897824, + "grad_norm": 2.0593700408935547, + "learning_rate": 9.507490829009639e-06, + "loss": 1.1161, + "step": 2068 + }, + { + "epoch": 1.565941343424787, + "grad_norm": 2.0308477878570557, + "learning_rate": 9.49947297184402e-06, + "loss": 1.0959, + "step": 2069 + }, + { + "epoch": 1.5666982024597917, + "grad_norm": 2.1143085956573486, + "learning_rate": 9.491455437955081e-06, + "loss": 1.1541, + "step": 2070 + }, + { + "epoch": 1.5674550614947966, + "grad_norm": 2.3376524448394775, + "learning_rate": 9.483438232510792e-06, + "loss": 1.1283, + "step": 2071 + }, + { + "epoch": 1.5682119205298013, + "grad_norm": 2.194188117980957, + "learning_rate": 9.475421360678926e-06, + "loss": 1.1256, + "step": 2072 + }, + { + "epoch": 1.568968779564806, + "grad_norm": 2.12689208984375, + "learning_rate": 9.467404827627036e-06, + "loss": 1.1638, + "step": 2073 + }, + { + "epoch": 1.5697256385998108, + "grad_norm": 1.9852758646011353, + "learning_rate": 9.459388638522455e-06, + "loss": 1.1458, + "step": 2074 + }, + { + "epoch": 1.5704824976348155, + "grad_norm": 1.958489179611206, + "learning_rate": 9.4513727985323e-06, + "loss": 1.1222, + "step": 2075 + }, + { + "epoch": 1.5712393566698202, + "grad_norm": 2.1876025199890137, + "learning_rate": 9.443357312823454e-06, + "loss": 1.1322, + "step": 2076 + }, + { + "epoch": 1.571996215704825, + "grad_norm": 2.1041505336761475, + "learning_rate": 9.435342186562582e-06, + "loss": 1.1237, + "step": 2077 + }, + { + "epoch": 1.5727530747398297, + "grad_norm": 2.247180461883545, + "learning_rate": 9.427327424916113e-06, + "loss": 1.056, + "step": 2078 + }, + { + "epoch": 1.5735099337748344, + "grad_norm": 2.184521436691284, + "learning_rate": 9.419313033050232e-06, + "loss": 1.1022, + "step": 2079 + }, + { + "epoch": 1.5742667928098393, + "grad_norm": 2.078411340713501, + "learning_rate": 9.411299016130902e-06, + "loss": 1.1526, + "step": 2080 + }, + { + "epoch": 1.575023651844844, + "grad_norm": 2.154078245162964, + "learning_rate": 9.403285379323833e-06, + "loss": 1.2138, + "step": 2081 + }, + { + "epoch": 1.5757805108798486, + "grad_norm": 2.0813803672790527, + "learning_rate": 9.395272127794491e-06, + "loss": 1.0913, + "step": 2082 + }, + { + "epoch": 1.5765373699148535, + "grad_norm": 1.975311517715454, + "learning_rate": 9.387259266708104e-06, + "loss": 1.1674, + "step": 2083 + }, + { + "epoch": 1.577294228949858, + "grad_norm": 2.022935152053833, + "learning_rate": 9.379246801229626e-06, + "loss": 1.0664, + "step": 2084 + }, + { + "epoch": 1.5780510879848628, + "grad_norm": 2.3060450553894043, + "learning_rate": 9.371234736523781e-06, + "loss": 1.0884, + "step": 2085 + }, + { + "epoch": 1.5788079470198677, + "grad_norm": 2.224121570587158, + "learning_rate": 9.36322307775502e-06, + "loss": 1.1056, + "step": 2086 + }, + { + "epoch": 1.5795648060548722, + "grad_norm": 2.4794466495513916, + "learning_rate": 9.35521183008754e-06, + "loss": 1.1189, + "step": 2087 + }, + { + "epoch": 1.580321665089877, + "grad_norm": 2.0150938034057617, + "learning_rate": 9.347200998685261e-06, + "loss": 1.1063, + "step": 2088 + }, + { + "epoch": 1.5810785241248817, + "grad_norm": 2.3067104816436768, + "learning_rate": 9.339190588711852e-06, + "loss": 1.1081, + "step": 2089 + }, + { + "epoch": 1.5818353831598864, + "grad_norm": 2.071730613708496, + "learning_rate": 9.331180605330695e-06, + "loss": 1.1256, + "step": 2090 + }, + { + "epoch": 1.5825922421948913, + "grad_norm": 2.099440097808838, + "learning_rate": 9.323171053704904e-06, + "loss": 1.1306, + "step": 2091 + }, + { + "epoch": 1.583349101229896, + "grad_norm": 2.1519389152526855, + "learning_rate": 9.315161938997315e-06, + "loss": 1.1495, + "step": 2092 + }, + { + "epoch": 1.5841059602649006, + "grad_norm": 2.1621830463409424, + "learning_rate": 9.30715326637048e-06, + "loss": 1.1637, + "step": 2093 + }, + { + "epoch": 1.5848628192999055, + "grad_norm": 2.2661333084106445, + "learning_rate": 9.299145040986674e-06, + "loss": 1.1102, + "step": 2094 + }, + { + "epoch": 1.5856196783349101, + "grad_norm": 2.2131712436676025, + "learning_rate": 9.291137268007863e-06, + "loss": 1.1336, + "step": 2095 + }, + { + "epoch": 1.5863765373699148, + "grad_norm": 2.1026811599731445, + "learning_rate": 9.283129952595747e-06, + "loss": 1.0903, + "step": 2096 + }, + { + "epoch": 1.5871333964049197, + "grad_norm": 2.0890021324157715, + "learning_rate": 9.275123099911719e-06, + "loss": 1.1232, + "step": 2097 + }, + { + "epoch": 1.5878902554399243, + "grad_norm": 2.1274547576904297, + "learning_rate": 9.267116715116866e-06, + "loss": 1.1067, + "step": 2098 + }, + { + "epoch": 1.588647114474929, + "grad_norm": 2.193621873855591, + "learning_rate": 9.259110803371987e-06, + "loss": 1.1304, + "step": 2099 + }, + { + "epoch": 1.589403973509934, + "grad_norm": 2.1822807788848877, + "learning_rate": 9.251105369837574e-06, + "loss": 1.1568, + "step": 2100 + }, + { + "epoch": 1.5901608325449383, + "grad_norm": 1.883682131767273, + "learning_rate": 9.243100419673798e-06, + "loss": 1.129, + "step": 2101 + }, + { + "epoch": 1.5909176915799432, + "grad_norm": 2.0487449169158936, + "learning_rate": 9.235095958040535e-06, + "loss": 1.1436, + "step": 2102 + }, + { + "epoch": 1.5916745506149481, + "grad_norm": 2.079259157180786, + "learning_rate": 9.22709199009734e-06, + "loss": 1.1538, + "step": 2103 + }, + { + "epoch": 1.5924314096499526, + "grad_norm": 2.1335606575012207, + "learning_rate": 9.219088521003444e-06, + "loss": 1.1106, + "step": 2104 + }, + { + "epoch": 1.5931882686849574, + "grad_norm": 3.516350507736206, + "learning_rate": 9.211085555917764e-06, + "loss": 1.1, + "step": 2105 + }, + { + "epoch": 1.593945127719962, + "grad_norm": 2.226984739303589, + "learning_rate": 9.203083099998885e-06, + "loss": 1.149, + "step": 2106 + }, + { + "epoch": 1.5947019867549668, + "grad_norm": 2.139308452606201, + "learning_rate": 9.195081158405074e-06, + "loss": 1.1636, + "step": 2107 + }, + { + "epoch": 1.5954588457899717, + "grad_norm": 2.194244146347046, + "learning_rate": 9.187079736294258e-06, + "loss": 1.1441, + "step": 2108 + }, + { + "epoch": 1.5962157048249763, + "grad_norm": 2.349120855331421, + "learning_rate": 9.179078838824029e-06, + "loss": 1.1093, + "step": 2109 + }, + { + "epoch": 1.596972563859981, + "grad_norm": 2.0828843116760254, + "learning_rate": 9.171078471151646e-06, + "loss": 1.1357, + "step": 2110 + }, + { + "epoch": 1.5977294228949859, + "grad_norm": 2.10848331451416, + "learning_rate": 9.163078638434028e-06, + "loss": 1.1125, + "step": 2111 + }, + { + "epoch": 1.5984862819299905, + "grad_norm": 2.1101763248443604, + "learning_rate": 9.155079345827737e-06, + "loss": 1.0964, + "step": 2112 + }, + { + "epoch": 1.5992431409649952, + "grad_norm": 2.0837841033935547, + "learning_rate": 9.147080598488999e-06, + "loss": 1.1121, + "step": 2113 + }, + { + "epoch": 1.6, + "grad_norm": 1.9847743511199951, + "learning_rate": 9.13908240157369e-06, + "loss": 1.1153, + "step": 2114 + }, + { + "epoch": 1.6007568590350048, + "grad_norm": 2.0701873302459717, + "learning_rate": 9.131084760237314e-06, + "loss": 1.1046, + "step": 2115 + }, + { + "epoch": 1.6015137180700094, + "grad_norm": 2.1153228282928467, + "learning_rate": 9.123087679635039e-06, + "loss": 1.1126, + "step": 2116 + }, + { + "epoch": 1.6022705771050143, + "grad_norm": 2.0515284538269043, + "learning_rate": 9.115091164921654e-06, + "loss": 1.1266, + "step": 2117 + }, + { + "epoch": 1.603027436140019, + "grad_norm": 2.08406662940979, + "learning_rate": 9.107095221251597e-06, + "loss": 1.1355, + "step": 2118 + }, + { + "epoch": 1.6037842951750236, + "grad_norm": 2.6160190105438232, + "learning_rate": 9.099099853778927e-06, + "loss": 1.1226, + "step": 2119 + }, + { + "epoch": 1.6045411542100285, + "grad_norm": 2.024075984954834, + "learning_rate": 9.091105067657335e-06, + "loss": 1.0951, + "step": 2120 + }, + { + "epoch": 1.605298013245033, + "grad_norm": 1.9903373718261719, + "learning_rate": 9.083110868040142e-06, + "loss": 1.1485, + "step": 2121 + }, + { + "epoch": 1.6060548722800378, + "grad_norm": 2.307220458984375, + "learning_rate": 9.075117260080286e-06, + "loss": 1.1698, + "step": 2122 + }, + { + "epoch": 1.6068117313150427, + "grad_norm": 2.160867929458618, + "learning_rate": 9.067124248930324e-06, + "loss": 1.0852, + "step": 2123 + }, + { + "epoch": 1.6075685903500472, + "grad_norm": 2.255039691925049, + "learning_rate": 9.059131839742425e-06, + "loss": 1.1299, + "step": 2124 + }, + { + "epoch": 1.608325449385052, + "grad_norm": 2.122530698776245, + "learning_rate": 9.051140037668385e-06, + "loss": 1.1148, + "step": 2125 + }, + { + "epoch": 1.6090823084200567, + "grad_norm": 2.067059278488159, + "learning_rate": 9.043148847859588e-06, + "loss": 1.1335, + "step": 2126 + }, + { + "epoch": 1.6098391674550614, + "grad_norm": 3.3186850547790527, + "learning_rate": 9.035158275467037e-06, + "loss": 1.0892, + "step": 2127 + }, + { + "epoch": 1.6105960264900663, + "grad_norm": 2.4546923637390137, + "learning_rate": 9.02716832564133e-06, + "loss": 1.1939, + "step": 2128 + }, + { + "epoch": 1.611352885525071, + "grad_norm": 2.36734938621521, + "learning_rate": 9.01917900353267e-06, + "loss": 1.163, + "step": 2129 + }, + { + "epoch": 1.6121097445600756, + "grad_norm": 2.2100653648376465, + "learning_rate": 9.011190314290852e-06, + "loss": 1.0951, + "step": 2130 + }, + { + "epoch": 1.6128666035950805, + "grad_norm": 2.239097833633423, + "learning_rate": 9.003202263065263e-06, + "loss": 1.1554, + "step": 2131 + }, + { + "epoch": 1.6136234626300852, + "grad_norm": 2.2774319648742676, + "learning_rate": 8.995214855004877e-06, + "loss": 1.1237, + "step": 2132 + }, + { + "epoch": 1.6143803216650898, + "grad_norm": 2.1328752040863037, + "learning_rate": 8.987228095258256e-06, + "loss": 1.1154, + "step": 2133 + }, + { + "epoch": 1.6151371807000947, + "grad_norm": 2.3373916149139404, + "learning_rate": 8.979241988973546e-06, + "loss": 1.1058, + "step": 2134 + }, + { + "epoch": 1.6158940397350994, + "grad_norm": 2.126988172531128, + "learning_rate": 8.971256541298468e-06, + "loss": 1.1709, + "step": 2135 + }, + { + "epoch": 1.616650898770104, + "grad_norm": 2.3820157051086426, + "learning_rate": 8.963271757380319e-06, + "loss": 1.1332, + "step": 2136 + }, + { + "epoch": 1.617407757805109, + "grad_norm": 2.0995140075683594, + "learning_rate": 8.955287642365969e-06, + "loss": 1.1341, + "step": 2137 + }, + { + "epoch": 1.6181646168401134, + "grad_norm": 2.2463080883026123, + "learning_rate": 8.94730420140186e-06, + "loss": 1.1455, + "step": 2138 + }, + { + "epoch": 1.6189214758751183, + "grad_norm": 2.293729066848755, + "learning_rate": 8.939321439633991e-06, + "loss": 1.103, + "step": 2139 + }, + { + "epoch": 1.6196783349101231, + "grad_norm": 2.1475143432617188, + "learning_rate": 8.931339362207931e-06, + "loss": 1.124, + "step": 2140 + }, + { + "epoch": 1.6204351939451276, + "grad_norm": 2.087843179702759, + "learning_rate": 8.923357974268806e-06, + "loss": 1.1743, + "step": 2141 + }, + { + "epoch": 1.6211920529801325, + "grad_norm": 2.0908894538879395, + "learning_rate": 8.915377280961298e-06, + "loss": 1.0961, + "step": 2142 + }, + { + "epoch": 1.6219489120151371, + "grad_norm": 2.312263250350952, + "learning_rate": 8.907397287429635e-06, + "loss": 1.1523, + "step": 2143 + }, + { + "epoch": 1.6227057710501418, + "grad_norm": 2.30190110206604, + "learning_rate": 8.899417998817605e-06, + "loss": 1.1319, + "step": 2144 + }, + { + "epoch": 1.6234626300851467, + "grad_norm": 2.0427803993225098, + "learning_rate": 8.891439420268534e-06, + "loss": 1.1007, + "step": 2145 + }, + { + "epoch": 1.6242194891201513, + "grad_norm": 2.142066717147827, + "learning_rate": 8.88346155692529e-06, + "loss": 1.1465, + "step": 2146 + }, + { + "epoch": 1.624976348155156, + "grad_norm": 1.9997434616088867, + "learning_rate": 8.875484413930283e-06, + "loss": 1.0966, + "step": 2147 + }, + { + "epoch": 1.625733207190161, + "grad_norm": 2.0591166019439697, + "learning_rate": 8.86750799642546e-06, + "loss": 1.1409, + "step": 2148 + }, + { + "epoch": 1.6264900662251656, + "grad_norm": 2.044402599334717, + "learning_rate": 8.859532309552298e-06, + "loss": 1.1145, + "step": 2149 + }, + { + "epoch": 1.6272469252601702, + "grad_norm": 2.3767645359039307, + "learning_rate": 8.8515573584518e-06, + "loss": 1.1278, + "step": 2150 + }, + { + "epoch": 1.6280037842951751, + "grad_norm": 2.088170289993286, + "learning_rate": 8.843583148264496e-06, + "loss": 1.1607, + "step": 2151 + }, + { + "epoch": 1.6287606433301798, + "grad_norm": 2.026031255722046, + "learning_rate": 8.835609684130448e-06, + "loss": 1.1173, + "step": 2152 + }, + { + "epoch": 1.6295175023651844, + "grad_norm": 2.108065366744995, + "learning_rate": 8.827636971189222e-06, + "loss": 1.1735, + "step": 2153 + }, + { + "epoch": 1.6302743614001893, + "grad_norm": 2.12156081199646, + "learning_rate": 8.819665014579911e-06, + "loss": 1.0851, + "step": 2154 + }, + { + "epoch": 1.631031220435194, + "grad_norm": 2.0895984172821045, + "learning_rate": 8.81169381944112e-06, + "loss": 1.1261, + "step": 2155 + }, + { + "epoch": 1.6317880794701987, + "grad_norm": 2.119001865386963, + "learning_rate": 8.803723390910951e-06, + "loss": 1.1236, + "step": 2156 + }, + { + "epoch": 1.6325449385052035, + "grad_norm": 2.0929312705993652, + "learning_rate": 8.795753734127024e-06, + "loss": 1.157, + "step": 2157 + }, + { + "epoch": 1.633301797540208, + "grad_norm": 2.0275888442993164, + "learning_rate": 8.787784854226465e-06, + "loss": 1.1407, + "step": 2158 + }, + { + "epoch": 1.6340586565752129, + "grad_norm": 2.332402467727661, + "learning_rate": 8.779816756345884e-06, + "loss": 1.0916, + "step": 2159 + }, + { + "epoch": 1.6348155156102178, + "grad_norm": 2.0872373580932617, + "learning_rate": 8.7718494456214e-06, + "loss": 1.1492, + "step": 2160 + }, + { + "epoch": 1.6355723746452222, + "grad_norm": 2.1566085815429688, + "learning_rate": 8.763882927188615e-06, + "loss": 1.1397, + "step": 2161 + }, + { + "epoch": 1.636329233680227, + "grad_norm": 2.134572744369507, + "learning_rate": 8.75591720618263e-06, + "loss": 1.0967, + "step": 2162 + }, + { + "epoch": 1.6370860927152318, + "grad_norm": 2.061708450317383, + "learning_rate": 8.74795228773803e-06, + "loss": 1.0709, + "step": 2163 + }, + { + "epoch": 1.6378429517502364, + "grad_norm": 2.04203724861145, + "learning_rate": 8.739988176988869e-06, + "loss": 1.0671, + "step": 2164 + }, + { + "epoch": 1.6385998107852413, + "grad_norm": 2.0386204719543457, + "learning_rate": 8.732024879068702e-06, + "loss": 1.1021, + "step": 2165 + }, + { + "epoch": 1.639356669820246, + "grad_norm": 2.104109764099121, + "learning_rate": 8.724062399110547e-06, + "loss": 1.0964, + "step": 2166 + }, + { + "epoch": 1.6401135288552506, + "grad_norm": 2.078735113143921, + "learning_rate": 8.716100742246894e-06, + "loss": 1.1241, + "step": 2167 + }, + { + "epoch": 1.6408703878902555, + "grad_norm": 2.1530871391296387, + "learning_rate": 8.708139913609705e-06, + "loss": 1.118, + "step": 2168 + }, + { + "epoch": 1.6416272469252602, + "grad_norm": 2.1105563640594482, + "learning_rate": 8.700179918330419e-06, + "loss": 1.0883, + "step": 2169 + }, + { + "epoch": 1.6423841059602649, + "grad_norm": 2.056195020675659, + "learning_rate": 8.692220761539912e-06, + "loss": 1.1549, + "step": 2170 + }, + { + "epoch": 1.6431409649952697, + "grad_norm": 2.327533006668091, + "learning_rate": 8.684262448368546e-06, + "loss": 1.1097, + "step": 2171 + }, + { + "epoch": 1.6438978240302744, + "grad_norm": 2.111985206604004, + "learning_rate": 8.676304983946122e-06, + "loss": 1.1048, + "step": 2172 + }, + { + "epoch": 1.644654683065279, + "grad_norm": 2.1778697967529297, + "learning_rate": 8.668348373401908e-06, + "loss": 1.1644, + "step": 2173 + }, + { + "epoch": 1.645411542100284, + "grad_norm": 2.3045222759246826, + "learning_rate": 8.660392621864608e-06, + "loss": 1.1873, + "step": 2174 + }, + { + "epoch": 1.6461684011352884, + "grad_norm": 2.2667534351348877, + "learning_rate": 8.652437734462377e-06, + "loss": 1.0519, + "step": 2175 + }, + { + "epoch": 1.6469252601702933, + "grad_norm": 2.394404888153076, + "learning_rate": 8.644483716322818e-06, + "loss": 1.1324, + "step": 2176 + }, + { + "epoch": 1.6476821192052982, + "grad_norm": 2.009328603744507, + "learning_rate": 8.63653057257297e-06, + "loss": 1.147, + "step": 2177 + }, + { + "epoch": 1.6484389782403026, + "grad_norm": 2.072662591934204, + "learning_rate": 8.6285783083393e-06, + "loss": 1.1435, + "step": 2178 + }, + { + "epoch": 1.6491958372753075, + "grad_norm": 2.184267044067383, + "learning_rate": 8.620626928747725e-06, + "loss": 1.1896, + "step": 2179 + }, + { + "epoch": 1.6499526963103122, + "grad_norm": 2.0765388011932373, + "learning_rate": 8.612676438923587e-06, + "loss": 1.125, + "step": 2180 + }, + { + "epoch": 1.6507095553453168, + "grad_norm": 2.2719030380249023, + "learning_rate": 8.604726843991637e-06, + "loss": 1.074, + "step": 2181 + }, + { + "epoch": 1.6514664143803217, + "grad_norm": 2.0793111324310303, + "learning_rate": 8.596778149076073e-06, + "loss": 1.099, + "step": 2182 + }, + { + "epoch": 1.6522232734153264, + "grad_norm": 1.9967988729476929, + "learning_rate": 8.588830359300499e-06, + "loss": 1.1454, + "step": 2183 + }, + { + "epoch": 1.652980132450331, + "grad_norm": 2.3746399879455566, + "learning_rate": 8.580883479787936e-06, + "loss": 1.1424, + "step": 2184 + }, + { + "epoch": 1.653736991485336, + "grad_norm": 2.0627639293670654, + "learning_rate": 8.57293751566083e-06, + "loss": 1.1052, + "step": 2185 + }, + { + "epoch": 1.6544938505203406, + "grad_norm": 2.196162700653076, + "learning_rate": 8.564992472041021e-06, + "loss": 1.1002, + "step": 2186 + }, + { + "epoch": 1.6552507095553453, + "grad_norm": 2.3567469120025635, + "learning_rate": 8.557048354049763e-06, + "loss": 1.1426, + "step": 2187 + }, + { + "epoch": 1.6560075685903501, + "grad_norm": 2.0467660427093506, + "learning_rate": 8.549105166807716e-06, + "loss": 1.0916, + "step": 2188 + }, + { + "epoch": 1.6567644276253548, + "grad_norm": 2.107483386993408, + "learning_rate": 8.541162915434935e-06, + "loss": 1.1226, + "step": 2189 + }, + { + "epoch": 1.6575212866603595, + "grad_norm": 2.2864937782287598, + "learning_rate": 8.533221605050878e-06, + "loss": 1.1246, + "step": 2190 + }, + { + "epoch": 1.6582781456953644, + "grad_norm": 2.135864734649658, + "learning_rate": 8.525281240774391e-06, + "loss": 1.0364, + "step": 2191 + }, + { + "epoch": 1.659035004730369, + "grad_norm": 2.136951446533203, + "learning_rate": 8.517341827723709e-06, + "loss": 1.1753, + "step": 2192 + }, + { + "epoch": 1.6597918637653737, + "grad_norm": 2.1684775352478027, + "learning_rate": 8.509403371016462e-06, + "loss": 1.0812, + "step": 2193 + }, + { + "epoch": 1.6605487228003786, + "grad_norm": 2.195051431655884, + "learning_rate": 8.501465875769652e-06, + "loss": 1.0883, + "step": 2194 + }, + { + "epoch": 1.661305581835383, + "grad_norm": 2.2236487865448, + "learning_rate": 8.493529347099669e-06, + "loss": 1.0637, + "step": 2195 + }, + { + "epoch": 1.662062440870388, + "grad_norm": 2.1652839183807373, + "learning_rate": 8.48559379012228e-06, + "loss": 1.1285, + "step": 2196 + }, + { + "epoch": 1.6628192999053926, + "grad_norm": 2.3312926292419434, + "learning_rate": 8.477659209952627e-06, + "loss": 1.1303, + "step": 2197 + }, + { + "epoch": 1.6635761589403972, + "grad_norm": 2.1476340293884277, + "learning_rate": 8.46972561170521e-06, + "loss": 1.0911, + "step": 2198 + }, + { + "epoch": 1.6643330179754021, + "grad_norm": 2.205261707305908, + "learning_rate": 8.461793000493917e-06, + "loss": 1.1075, + "step": 2199 + }, + { + "epoch": 1.6650898770104068, + "grad_norm": 2.3584821224212646, + "learning_rate": 8.45386138143198e-06, + "loss": 1.1429, + "step": 2200 + }, + { + "epoch": 1.6658467360454114, + "grad_norm": 1.990213394165039, + "learning_rate": 8.445930759632e-06, + "loss": 1.0731, + "step": 2201 + }, + { + "epoch": 1.6666035950804163, + "grad_norm": 2.114382266998291, + "learning_rate": 8.43800114020594e-06, + "loss": 1.1304, + "step": 2202 + }, + { + "epoch": 1.667360454115421, + "grad_norm": 2.6425230503082275, + "learning_rate": 8.430072528265107e-06, + "loss": 1.1223, + "step": 2203 + }, + { + "epoch": 1.6681173131504257, + "grad_norm": 2.238675594329834, + "learning_rate": 8.422144928920168e-06, + "loss": 1.1187, + "step": 2204 + }, + { + "epoch": 1.6688741721854305, + "grad_norm": 2.0409348011016846, + "learning_rate": 8.414218347281127e-06, + "loss": 1.0912, + "step": 2205 + }, + { + "epoch": 1.6696310312204352, + "grad_norm": 2.5583693981170654, + "learning_rate": 8.406292788457338e-06, + "loss": 1.1433, + "step": 2206 + }, + { + "epoch": 1.6703878902554399, + "grad_norm": 2.24996018409729, + "learning_rate": 8.398368257557505e-06, + "loss": 1.1177, + "step": 2207 + }, + { + "epoch": 1.6711447492904448, + "grad_norm": 2.2110371589660645, + "learning_rate": 8.390444759689646e-06, + "loss": 1.1334, + "step": 2208 + }, + { + "epoch": 1.6719016083254494, + "grad_norm": 2.0102930068969727, + "learning_rate": 8.382522299961135e-06, + "loss": 1.0807, + "step": 2209 + }, + { + "epoch": 1.672658467360454, + "grad_norm": 2.319051504135132, + "learning_rate": 8.37460088347867e-06, + "loss": 1.0997, + "step": 2210 + }, + { + "epoch": 1.673415326395459, + "grad_norm": 2.3122832775115967, + "learning_rate": 8.36668051534827e-06, + "loss": 1.1591, + "step": 2211 + }, + { + "epoch": 1.6741721854304634, + "grad_norm": 2.39446759223938, + "learning_rate": 8.358761200675284e-06, + "loss": 1.1201, + "step": 2212 + }, + { + "epoch": 1.6749290444654683, + "grad_norm": 2.257894515991211, + "learning_rate": 8.350842944564386e-06, + "loss": 1.1094, + "step": 2213 + }, + { + "epoch": 1.6756859035004732, + "grad_norm": 2.2300925254821777, + "learning_rate": 8.34292575211956e-06, + "loss": 1.1613, + "step": 2214 + }, + { + "epoch": 1.6764427625354776, + "grad_norm": 2.2363715171813965, + "learning_rate": 8.33500962844411e-06, + "loss": 1.1485, + "step": 2215 + }, + { + "epoch": 1.6771996215704825, + "grad_norm": 2.0160231590270996, + "learning_rate": 8.327094578640643e-06, + "loss": 1.1136, + "step": 2216 + }, + { + "epoch": 1.6779564806054872, + "grad_norm": 2.279360294342041, + "learning_rate": 8.319180607811085e-06, + "loss": 1.1193, + "step": 2217 + }, + { + "epoch": 1.6787133396404919, + "grad_norm": 2.280641555786133, + "learning_rate": 8.31126772105666e-06, + "loss": 1.1655, + "step": 2218 + }, + { + "epoch": 1.6794701986754967, + "grad_norm": 2.077263832092285, + "learning_rate": 8.303355923477889e-06, + "loss": 1.1435, + "step": 2219 + }, + { + "epoch": 1.6802270577105014, + "grad_norm": 2.271101236343384, + "learning_rate": 8.295445220174604e-06, + "loss": 1.0986, + "step": 2220 + }, + { + "epoch": 1.680983916745506, + "grad_norm": 2.257680892944336, + "learning_rate": 8.28753561624592e-06, + "loss": 1.095, + "step": 2221 + }, + { + "epoch": 1.681740775780511, + "grad_norm": 2.18369722366333, + "learning_rate": 8.279627116790244e-06, + "loss": 1.1007, + "step": 2222 + }, + { + "epoch": 1.6824976348155156, + "grad_norm": 2.0165445804595947, + "learning_rate": 8.271719726905275e-06, + "loss": 1.1165, + "step": 2223 + }, + { + "epoch": 1.6832544938505203, + "grad_norm": 2.2388248443603516, + "learning_rate": 8.263813451688006e-06, + "loss": 1.1186, + "step": 2224 + }, + { + "epoch": 1.6840113528855252, + "grad_norm": 2.32745099067688, + "learning_rate": 8.255908296234688e-06, + "loss": 1.1812, + "step": 2225 + }, + { + "epoch": 1.6847682119205298, + "grad_norm": 2.361375093460083, + "learning_rate": 8.248004265640868e-06, + "loss": 1.1649, + "step": 2226 + }, + { + "epoch": 1.6855250709555345, + "grad_norm": 2.2414417266845703, + "learning_rate": 8.240101365001368e-06, + "loss": 1.1013, + "step": 2227 + }, + { + "epoch": 1.6862819299905394, + "grad_norm": 1.9859745502471924, + "learning_rate": 8.232199599410273e-06, + "loss": 1.1295, + "step": 2228 + }, + { + "epoch": 1.6870387890255438, + "grad_norm": 2.1757733821868896, + "learning_rate": 8.22429897396094e-06, + "loss": 1.1223, + "step": 2229 + }, + { + "epoch": 1.6877956480605487, + "grad_norm": 2.36989164352417, + "learning_rate": 8.216399493745992e-06, + "loss": 1.1337, + "step": 2230 + }, + { + "epoch": 1.6885525070955536, + "grad_norm": 1.9508718252182007, + "learning_rate": 8.208501163857318e-06, + "loss": 1.1351, + "step": 2231 + }, + { + "epoch": 1.689309366130558, + "grad_norm": 2.057548999786377, + "learning_rate": 8.200603989386055e-06, + "loss": 1.1382, + "step": 2232 + }, + { + "epoch": 1.690066225165563, + "grad_norm": 2.3355371952056885, + "learning_rate": 8.192707975422604e-06, + "loss": 1.1393, + "step": 2233 + }, + { + "epoch": 1.6908230842005676, + "grad_norm": 2.1525468826293945, + "learning_rate": 8.184813127056616e-06, + "loss": 1.1665, + "step": 2234 + }, + { + "epoch": 1.6915799432355723, + "grad_norm": 2.2342827320098877, + "learning_rate": 8.176919449376989e-06, + "loss": 1.1385, + "step": 2235 + }, + { + "epoch": 1.6923368022705771, + "grad_norm": 2.1949453353881836, + "learning_rate": 8.169026947471866e-06, + "loss": 1.1635, + "step": 2236 + }, + { + "epoch": 1.6930936613055818, + "grad_norm": 2.183218240737915, + "learning_rate": 8.161135626428633e-06, + "loss": 1.1433, + "step": 2237 + }, + { + "epoch": 1.6938505203405865, + "grad_norm": 2.1611173152923584, + "learning_rate": 8.153245491333922e-06, + "loss": 1.1083, + "step": 2238 + }, + { + "epoch": 1.6946073793755914, + "grad_norm": 2.0343189239501953, + "learning_rate": 8.145356547273584e-06, + "loss": 1.1334, + "step": 2239 + }, + { + "epoch": 1.695364238410596, + "grad_norm": 2.0303726196289062, + "learning_rate": 8.13746879933272e-06, + "loss": 1.1545, + "step": 2240 + }, + { + "epoch": 1.6961210974456007, + "grad_norm": 2.152381658554077, + "learning_rate": 8.129582252595645e-06, + "loss": 1.1316, + "step": 2241 + }, + { + "epoch": 1.6968779564806056, + "grad_norm": 2.0682196617126465, + "learning_rate": 8.12169691214591e-06, + "loss": 1.1396, + "step": 2242 + }, + { + "epoch": 1.6976348155156102, + "grad_norm": 2.07098388671875, + "learning_rate": 8.113812783066288e-06, + "loss": 1.0784, + "step": 2243 + }, + { + "epoch": 1.698391674550615, + "grad_norm": 2.1533405780792236, + "learning_rate": 8.105929870438762e-06, + "loss": 1.1151, + "step": 2244 + }, + { + "epoch": 1.6991485335856198, + "grad_norm": 2.348604679107666, + "learning_rate": 8.098048179344545e-06, + "loss": 1.0913, + "step": 2245 + }, + { + "epoch": 1.6999053926206245, + "grad_norm": 2.0196192264556885, + "learning_rate": 8.090167714864051e-06, + "loss": 1.1334, + "step": 2246 + }, + { + "epoch": 1.7006622516556291, + "grad_norm": 2.2407851219177246, + "learning_rate": 8.082288482076904e-06, + "loss": 1.1362, + "step": 2247 + }, + { + "epoch": 1.701419110690634, + "grad_norm": 2.0429224967956543, + "learning_rate": 8.074410486061943e-06, + "loss": 1.0377, + "step": 2248 + }, + { + "epoch": 1.7021759697256384, + "grad_norm": 2.149394989013672, + "learning_rate": 8.066533731897202e-06, + "loss": 1.1324, + "step": 2249 + }, + { + "epoch": 1.7029328287606433, + "grad_norm": 2.5104711055755615, + "learning_rate": 8.058658224659914e-06, + "loss": 1.1037, + "step": 2250 + }, + { + "epoch": 1.7036896877956482, + "grad_norm": 1.9572622776031494, + "learning_rate": 8.050783969426517e-06, + "loss": 1.1231, + "step": 2251 + }, + { + "epoch": 1.7044465468306527, + "grad_norm": 2.344362497329712, + "learning_rate": 8.042910971272627e-06, + "loss": 1.1054, + "step": 2252 + }, + { + "epoch": 1.7052034058656576, + "grad_norm": 2.160923719406128, + "learning_rate": 8.035039235273063e-06, + "loss": 1.1429, + "step": 2253 + }, + { + "epoch": 1.7059602649006622, + "grad_norm": 2.273373603820801, + "learning_rate": 8.027168766501831e-06, + "loss": 1.1073, + "step": 2254 + }, + { + "epoch": 1.7067171239356669, + "grad_norm": 2.043576955795288, + "learning_rate": 8.019299570032108e-06, + "loss": 1.1326, + "step": 2255 + }, + { + "epoch": 1.7074739829706718, + "grad_norm": 2.3227038383483887, + "learning_rate": 8.011431650936259e-06, + "loss": 1.0762, + "step": 2256 + }, + { + "epoch": 1.7082308420056764, + "grad_norm": 2.2618634700775146, + "learning_rate": 8.003565014285829e-06, + "loss": 1.1246, + "step": 2257 + }, + { + "epoch": 1.708987701040681, + "grad_norm": 2.072833776473999, + "learning_rate": 7.99569966515153e-06, + "loss": 1.128, + "step": 2258 + }, + { + "epoch": 1.709744560075686, + "grad_norm": 2.305095911026001, + "learning_rate": 7.987835608603241e-06, + "loss": 1.0533, + "step": 2259 + }, + { + "epoch": 1.7105014191106906, + "grad_norm": 2.25026535987854, + "learning_rate": 7.979972849710022e-06, + "loss": 1.1497, + "step": 2260 + }, + { + "epoch": 1.7112582781456953, + "grad_norm": 2.259713888168335, + "learning_rate": 7.972111393540079e-06, + "loss": 1.1364, + "step": 2261 + }, + { + "epoch": 1.7120151371807002, + "grad_norm": 2.3140814304351807, + "learning_rate": 7.964251245160795e-06, + "loss": 1.1363, + "step": 2262 + }, + { + "epoch": 1.7127719962157049, + "grad_norm": 2.25529408454895, + "learning_rate": 7.956392409638693e-06, + "loss": 1.1304, + "step": 2263 + }, + { + "epoch": 1.7135288552507095, + "grad_norm": 2.118211030960083, + "learning_rate": 7.948534892039462e-06, + "loss": 1.1227, + "step": 2264 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 2.260540008544922, + "learning_rate": 7.940678697427945e-06, + "loss": 1.0716, + "step": 2265 + }, + { + "epoch": 1.7150425733207189, + "grad_norm": 2.334322690963745, + "learning_rate": 7.932823830868114e-06, + "loss": 1.1458, + "step": 2266 + }, + { + "epoch": 1.7157994323557237, + "grad_norm": 2.1086597442626953, + "learning_rate": 7.9249702974231e-06, + "loss": 1.1264, + "step": 2267 + }, + { + "epoch": 1.7165562913907286, + "grad_norm": 2.286928176879883, + "learning_rate": 7.917118102155175e-06, + "loss": 1.1079, + "step": 2268 + }, + { + "epoch": 1.717313150425733, + "grad_norm": 2.132174491882324, + "learning_rate": 7.909267250125743e-06, + "loss": 1.1201, + "step": 2269 + }, + { + "epoch": 1.718070009460738, + "grad_norm": 2.0687386989593506, + "learning_rate": 7.901417746395338e-06, + "loss": 1.0981, + "step": 2270 + }, + { + "epoch": 1.7188268684957426, + "grad_norm": 2.2579548358917236, + "learning_rate": 7.893569596023638e-06, + "loss": 1.1193, + "step": 2271 + }, + { + "epoch": 1.7195837275307473, + "grad_norm": 2.1385035514831543, + "learning_rate": 7.885722804069435e-06, + "loss": 1.1826, + "step": 2272 + }, + { + "epoch": 1.7203405865657522, + "grad_norm": 1.9475092887878418, + "learning_rate": 7.877877375590657e-06, + "loss": 1.101, + "step": 2273 + }, + { + "epoch": 1.7210974456007568, + "grad_norm": 2.2321999073028564, + "learning_rate": 7.87003331564434e-06, + "loss": 1.121, + "step": 2274 + }, + { + "epoch": 1.7218543046357615, + "grad_norm": 2.091757297515869, + "learning_rate": 7.86219062928665e-06, + "loss": 1.0453, + "step": 2275 + }, + { + "epoch": 1.7226111636707664, + "grad_norm": 1.965161681175232, + "learning_rate": 7.854349321572868e-06, + "loss": 1.1628, + "step": 2276 + }, + { + "epoch": 1.723368022705771, + "grad_norm": 2.090461492538452, + "learning_rate": 7.846509397557372e-06, + "loss": 1.1603, + "step": 2277 + }, + { + "epoch": 1.7241248817407757, + "grad_norm": 2.0913045406341553, + "learning_rate": 7.83867086229366e-06, + "loss": 1.186, + "step": 2278 + }, + { + "epoch": 1.7248817407757806, + "grad_norm": 2.137932300567627, + "learning_rate": 7.83083372083434e-06, + "loss": 1.129, + "step": 2279 + }, + { + "epoch": 1.7256385998107853, + "grad_norm": 2.1641454696655273, + "learning_rate": 7.822997978231101e-06, + "loss": 1.1261, + "step": 2280 + }, + { + "epoch": 1.72639545884579, + "grad_norm": 1.977129340171814, + "learning_rate": 7.815163639534752e-06, + "loss": 1.1332, + "step": 2281 + }, + { + "epoch": 1.7271523178807948, + "grad_norm": 2.3945367336273193, + "learning_rate": 7.807330709795191e-06, + "loss": 1.0711, + "step": 2282 + }, + { + "epoch": 1.7279091769157995, + "grad_norm": 1.9358054399490356, + "learning_rate": 7.799499194061395e-06, + "loss": 1.0986, + "step": 2283 + }, + { + "epoch": 1.7286660359508041, + "grad_norm": 2.242386817932129, + "learning_rate": 7.791669097381447e-06, + "loss": 1.1113, + "step": 2284 + }, + { + "epoch": 1.729422894985809, + "grad_norm": 2.1633381843566895, + "learning_rate": 7.783840424802504e-06, + "loss": 1.1581, + "step": 2285 + }, + { + "epoch": 1.7301797540208135, + "grad_norm": 2.0929739475250244, + "learning_rate": 7.776013181370813e-06, + "loss": 1.1479, + "step": 2286 + }, + { + "epoch": 1.7309366130558184, + "grad_norm": 2.2305848598480225, + "learning_rate": 7.768187372131693e-06, + "loss": 1.1683, + "step": 2287 + }, + { + "epoch": 1.7316934720908232, + "grad_norm": 1.938439130783081, + "learning_rate": 7.76036300212954e-06, + "loss": 1.0774, + "step": 2288 + }, + { + "epoch": 1.7324503311258277, + "grad_norm": 2.031890869140625, + "learning_rate": 7.752540076407829e-06, + "loss": 1.0838, + "step": 2289 + }, + { + "epoch": 1.7332071901608326, + "grad_norm": 2.242338180541992, + "learning_rate": 7.744718600009093e-06, + "loss": 1.1161, + "step": 2290 + }, + { + "epoch": 1.7339640491958372, + "grad_norm": 2.1008989810943604, + "learning_rate": 7.736898577974936e-06, + "loss": 1.071, + "step": 2291 + }, + { + "epoch": 1.734720908230842, + "grad_norm": 1.9958288669586182, + "learning_rate": 7.72908001534603e-06, + "loss": 1.0919, + "step": 2292 + }, + { + "epoch": 1.7354777672658468, + "grad_norm": 2.2547950744628906, + "learning_rate": 7.7212629171621e-06, + "loss": 1.1024, + "step": 2293 + }, + { + "epoch": 1.7362346263008515, + "grad_norm": 2.198230743408203, + "learning_rate": 7.713447288461922e-06, + "loss": 1.1567, + "step": 2294 + }, + { + "epoch": 1.7369914853358561, + "grad_norm": 2.047135591506958, + "learning_rate": 7.705633134283342e-06, + "loss": 1.1345, + "step": 2295 + }, + { + "epoch": 1.737748344370861, + "grad_norm": 2.021092176437378, + "learning_rate": 7.697820459663234e-06, + "loss": 1.0968, + "step": 2296 + }, + { + "epoch": 1.7385052034058657, + "grad_norm": 2.04164719581604, + "learning_rate": 7.690009269637535e-06, + "loss": 1.1234, + "step": 2297 + }, + { + "epoch": 1.7392620624408703, + "grad_norm": 2.0042457580566406, + "learning_rate": 7.68219956924122e-06, + "loss": 1.1361, + "step": 2298 + }, + { + "epoch": 1.7400189214758752, + "grad_norm": 2.207336902618408, + "learning_rate": 7.674391363508293e-06, + "loss": 1.1241, + "step": 2299 + }, + { + "epoch": 1.7407757805108799, + "grad_norm": 2.4397289752960205, + "learning_rate": 7.666584657471819e-06, + "loss": 1.1499, + "step": 2300 + }, + { + "epoch": 1.7415326395458846, + "grad_norm": 1.9008210897445679, + "learning_rate": 7.65877945616387e-06, + "loss": 1.075, + "step": 2301 + }, + { + "epoch": 1.7422894985808894, + "grad_norm": 2.0731241703033447, + "learning_rate": 7.650975764615564e-06, + "loss": 1.1104, + "step": 2302 + }, + { + "epoch": 1.7430463576158939, + "grad_norm": 2.0302274227142334, + "learning_rate": 7.643173587857043e-06, + "loss": 1.1129, + "step": 2303 + }, + { + "epoch": 1.7438032166508988, + "grad_norm": 2.3482978343963623, + "learning_rate": 7.635372930917471e-06, + "loss": 1.1239, + "step": 2304 + }, + { + "epoch": 1.7445600756859037, + "grad_norm": 2.1537070274353027, + "learning_rate": 7.627573798825028e-06, + "loss": 1.1213, + "step": 2305 + }, + { + "epoch": 1.745316934720908, + "grad_norm": 3.2694427967071533, + "learning_rate": 7.619776196606923e-06, + "loss": 1.1201, + "step": 2306 + }, + { + "epoch": 1.746073793755913, + "grad_norm": 1.9938985109329224, + "learning_rate": 7.611980129289362e-06, + "loss": 1.1202, + "step": 2307 + }, + { + "epoch": 1.7468306527909176, + "grad_norm": 1.9866852760314941, + "learning_rate": 7.604185601897578e-06, + "loss": 1.1716, + "step": 2308 + }, + { + "epoch": 1.7475875118259223, + "grad_norm": 2.149052143096924, + "learning_rate": 7.596392619455805e-06, + "loss": 1.0911, + "step": 2309 + }, + { + "epoch": 1.7483443708609272, + "grad_norm": 2.1124277114868164, + "learning_rate": 7.588601186987277e-06, + "loss": 1.1686, + "step": 2310 + }, + { + "epoch": 1.7491012298959319, + "grad_norm": 2.1678647994995117, + "learning_rate": 7.5808113095142334e-06, + "loss": 1.1356, + "step": 2311 + }, + { + "epoch": 1.7498580889309365, + "grad_norm": 2.2444238662719727, + "learning_rate": 7.573022992057911e-06, + "loss": 1.1302, + "step": 2312 + }, + { + "epoch": 1.7506149479659414, + "grad_norm": 2.296766757965088, + "learning_rate": 7.565236239638542e-06, + "loss": 1.1317, + "step": 2313 + }, + { + "epoch": 1.751371807000946, + "grad_norm": 2.106170177459717, + "learning_rate": 7.557451057275346e-06, + "loss": 1.138, + "step": 2314 + }, + { + "epoch": 1.7521286660359507, + "grad_norm": 1.8964704275131226, + "learning_rate": 7.549667449986533e-06, + "loss": 1.1121, + "step": 2315 + }, + { + "epoch": 1.7528855250709556, + "grad_norm": 2.2497787475585938, + "learning_rate": 7.541885422789297e-06, + "loss": 1.1607, + "step": 2316 + }, + { + "epoch": 1.7536423841059603, + "grad_norm": 1.980980634689331, + "learning_rate": 7.53410498069982e-06, + "loss": 1.0806, + "step": 2317 + }, + { + "epoch": 1.754399243140965, + "grad_norm": 2.030378818511963, + "learning_rate": 7.526326128733247e-06, + "loss": 1.1, + "step": 2318 + }, + { + "epoch": 1.7551561021759698, + "grad_norm": 1.9196511507034302, + "learning_rate": 7.5185488719037105e-06, + "loss": 1.1771, + "step": 2319 + }, + { + "epoch": 1.7559129612109745, + "grad_norm": 1.9450955390930176, + "learning_rate": 7.510773215224318e-06, + "loss": 1.1347, + "step": 2320 + }, + { + "epoch": 1.7566698202459792, + "grad_norm": 1.9371559619903564, + "learning_rate": 7.502999163707131e-06, + "loss": 1.1026, + "step": 2321 + }, + { + "epoch": 1.757426679280984, + "grad_norm": 2.021090269088745, + "learning_rate": 7.4952267223631865e-06, + "loss": 1.1514, + "step": 2322 + }, + { + "epoch": 1.7581835383159885, + "grad_norm": 2.016483783721924, + "learning_rate": 7.487455896202487e-06, + "loss": 1.1315, + "step": 2323 + }, + { + "epoch": 1.7589403973509934, + "grad_norm": 2.051363945007324, + "learning_rate": 7.479686690233981e-06, + "loss": 1.1094, + "step": 2324 + }, + { + "epoch": 1.759697256385998, + "grad_norm": 2.3509905338287354, + "learning_rate": 7.471919109465584e-06, + "loss": 1.1104, + "step": 2325 + }, + { + "epoch": 1.7604541154210027, + "grad_norm": 2.0247390270233154, + "learning_rate": 7.46415315890416e-06, + "loss": 1.1122, + "step": 2326 + }, + { + "epoch": 1.7612109744560076, + "grad_norm": 2.1923465728759766, + "learning_rate": 7.456388843555525e-06, + "loss": 1.1308, + "step": 2327 + }, + { + "epoch": 1.7619678334910123, + "grad_norm": 2.132502555847168, + "learning_rate": 7.448626168424434e-06, + "loss": 1.1637, + "step": 2328 + }, + { + "epoch": 1.762724692526017, + "grad_norm": 1.9766474962234497, + "learning_rate": 7.440865138514587e-06, + "loss": 1.1019, + "step": 2329 + }, + { + "epoch": 1.7634815515610218, + "grad_norm": 2.1354434490203857, + "learning_rate": 7.433105758828631e-06, + "loss": 1.0869, + "step": 2330 + }, + { + "epoch": 1.7642384105960265, + "grad_norm": 2.135441303253174, + "learning_rate": 7.425348034368143e-06, + "loss": 1.1077, + "step": 2331 + }, + { + "epoch": 1.7649952696310311, + "grad_norm": 1.9634079933166504, + "learning_rate": 7.41759197013363e-06, + "loss": 1.0686, + "step": 2332 + }, + { + "epoch": 1.765752128666036, + "grad_norm": 2.202788829803467, + "learning_rate": 7.409837571124535e-06, + "loss": 1.0706, + "step": 2333 + }, + { + "epoch": 1.7665089877010407, + "grad_norm": 2.3422369956970215, + "learning_rate": 7.40208484233923e-06, + "loss": 1.1673, + "step": 2334 + }, + { + "epoch": 1.7672658467360454, + "grad_norm": 2.338772773742676, + "learning_rate": 7.394333788774995e-06, + "loss": 1.1037, + "step": 2335 + }, + { + "epoch": 1.7680227057710503, + "grad_norm": 2.2548608779907227, + "learning_rate": 7.386584415428051e-06, + "loss": 1.1583, + "step": 2336 + }, + { + "epoch": 1.768779564806055, + "grad_norm": 2.1475353240966797, + "learning_rate": 7.3788367272935235e-06, + "loss": 1.1232, + "step": 2337 + }, + { + "epoch": 1.7695364238410596, + "grad_norm": 2.1223628520965576, + "learning_rate": 7.37109072936545e-06, + "loss": 1.1164, + "step": 2338 + }, + { + "epoch": 1.7702932828760645, + "grad_norm": 2.0361294746398926, + "learning_rate": 7.363346426636786e-06, + "loss": 1.16, + "step": 2339 + }, + { + "epoch": 1.771050141911069, + "grad_norm": 2.1143364906311035, + "learning_rate": 7.355603824099389e-06, + "loss": 1.142, + "step": 2340 + }, + { + "epoch": 1.7718070009460738, + "grad_norm": 2.2214882373809814, + "learning_rate": 7.347862926744027e-06, + "loss": 1.1375, + "step": 2341 + }, + { + "epoch": 1.7725638599810787, + "grad_norm": 1.9182907342910767, + "learning_rate": 7.34012373956036e-06, + "loss": 1.1099, + "step": 2342 + }, + { + "epoch": 1.7733207190160831, + "grad_norm": 2.0895349979400635, + "learning_rate": 7.332386267536949e-06, + "loss": 1.1397, + "step": 2343 + }, + { + "epoch": 1.774077578051088, + "grad_norm": 2.078885793685913, + "learning_rate": 7.3246505156612554e-06, + "loss": 1.1296, + "step": 2344 + }, + { + "epoch": 1.7748344370860927, + "grad_norm": 2.180187702178955, + "learning_rate": 7.3169164889196235e-06, + "loss": 1.1458, + "step": 2345 + }, + { + "epoch": 1.7755912961210973, + "grad_norm": 2.3137030601501465, + "learning_rate": 7.309184192297289e-06, + "loss": 1.0713, + "step": 2346 + }, + { + "epoch": 1.7763481551561022, + "grad_norm": 2.0382871627807617, + "learning_rate": 7.3014536307783725e-06, + "loss": 1.1288, + "step": 2347 + }, + { + "epoch": 1.777105014191107, + "grad_norm": 2.004988670349121, + "learning_rate": 7.293724809345879e-06, + "loss": 1.1304, + "step": 2348 + }, + { + "epoch": 1.7778618732261116, + "grad_norm": 2.0899946689605713, + "learning_rate": 7.285997732981683e-06, + "loss": 1.1095, + "step": 2349 + }, + { + "epoch": 1.7786187322611164, + "grad_norm": 2.197770118713379, + "learning_rate": 7.2782724066665475e-06, + "loss": 1.1219, + "step": 2350 + }, + { + "epoch": 1.779375591296121, + "grad_norm": 1.9547758102416992, + "learning_rate": 7.270548835380095e-06, + "loss": 1.0707, + "step": 2351 + }, + { + "epoch": 1.7801324503311258, + "grad_norm": 2.0842347145080566, + "learning_rate": 7.262827024100821e-06, + "loss": 1.1485, + "step": 2352 + }, + { + "epoch": 1.7808893093661307, + "grad_norm": 2.189990520477295, + "learning_rate": 7.255106977806092e-06, + "loss": 1.1403, + "step": 2353 + }, + { + "epoch": 1.7816461684011353, + "grad_norm": 2.299306869506836, + "learning_rate": 7.247388701472129e-06, + "loss": 1.1001, + "step": 2354 + }, + { + "epoch": 1.78240302743614, + "grad_norm": 2.0084657669067383, + "learning_rate": 7.239672200074012e-06, + "loss": 1.0777, + "step": 2355 + }, + { + "epoch": 1.7831598864711449, + "grad_norm": 1.9171555042266846, + "learning_rate": 7.231957478585687e-06, + "loss": 1.1022, + "step": 2356 + }, + { + "epoch": 1.7839167455061493, + "grad_norm": 2.116420030593872, + "learning_rate": 7.224244541979941e-06, + "loss": 1.0945, + "step": 2357 + }, + { + "epoch": 1.7846736045411542, + "grad_norm": 2.250598669052124, + "learning_rate": 7.216533395228419e-06, + "loss": 1.1327, + "step": 2358 + }, + { + "epoch": 1.785430463576159, + "grad_norm": 2.0988495349884033, + "learning_rate": 7.208824043301604e-06, + "loss": 1.1452, + "step": 2359 + }, + { + "epoch": 1.7861873226111635, + "grad_norm": 1.9777265787124634, + "learning_rate": 7.201116491168829e-06, + "loss": 1.0838, + "step": 2360 + }, + { + "epoch": 1.7869441816461684, + "grad_norm": 2.1055500507354736, + "learning_rate": 7.19341074379827e-06, + "loss": 1.0996, + "step": 2361 + }, + { + "epoch": 1.787701040681173, + "grad_norm": 1.8813843727111816, + "learning_rate": 7.185706806156921e-06, + "loss": 1.1238, + "step": 2362 + }, + { + "epoch": 1.7884578997161777, + "grad_norm": 1.9652965068817139, + "learning_rate": 7.178004683210634e-06, + "loss": 1.1141, + "step": 2363 + }, + { + "epoch": 1.7892147587511826, + "grad_norm": 2.2869348526000977, + "learning_rate": 7.170304379924078e-06, + "loss": 1.1882, + "step": 2364 + }, + { + "epoch": 1.7899716177861873, + "grad_norm": 2.1046929359436035, + "learning_rate": 7.162605901260749e-06, + "loss": 1.0947, + "step": 2365 + }, + { + "epoch": 1.790728476821192, + "grad_norm": 2.0936052799224854, + "learning_rate": 7.1549092521829676e-06, + "loss": 1.1371, + "step": 2366 + }, + { + "epoch": 1.7914853358561968, + "grad_norm": 2.0121428966522217, + "learning_rate": 7.147214437651881e-06, + "loss": 1.102, + "step": 2367 + }, + { + "epoch": 1.7922421948912015, + "grad_norm": 2.144970178604126, + "learning_rate": 7.139521462627446e-06, + "loss": 1.1266, + "step": 2368 + }, + { + "epoch": 1.7929990539262062, + "grad_norm": 2.3722221851348877, + "learning_rate": 7.1318303320684356e-06, + "loss": 1.1499, + "step": 2369 + }, + { + "epoch": 1.793755912961211, + "grad_norm": 2.255847454071045, + "learning_rate": 7.124141050932441e-06, + "loss": 1.1243, + "step": 2370 + }, + { + "epoch": 1.7945127719962157, + "grad_norm": 2.1879565715789795, + "learning_rate": 7.116453624175847e-06, + "loss": 1.0995, + "step": 2371 + }, + { + "epoch": 1.7952696310312204, + "grad_norm": 2.267245292663574, + "learning_rate": 7.108768056753863e-06, + "loss": 1.156, + "step": 2372 + }, + { + "epoch": 1.7960264900662253, + "grad_norm": 2.1807005405426025, + "learning_rate": 7.101084353620476e-06, + "loss": 1.1588, + "step": 2373 + }, + { + "epoch": 1.79678334910123, + "grad_norm": 2.2159693241119385, + "learning_rate": 7.0934025197284924e-06, + "loss": 1.0647, + "step": 2374 + }, + { + "epoch": 1.7975402081362346, + "grad_norm": 2.1058151721954346, + "learning_rate": 7.085722560029507e-06, + "loss": 1.1166, + "step": 2375 + }, + { + "epoch": 1.7982970671712395, + "grad_norm": 2.202956438064575, + "learning_rate": 7.0780444794738945e-06, + "loss": 1.1524, + "step": 2376 + }, + { + "epoch": 1.799053926206244, + "grad_norm": 2.15413761138916, + "learning_rate": 7.070368283010836e-06, + "loss": 1.1331, + "step": 2377 + }, + { + "epoch": 1.7998107852412488, + "grad_norm": 1.968179702758789, + "learning_rate": 7.062693975588291e-06, + "loss": 1.0785, + "step": 2378 + }, + { + "epoch": 1.8005676442762537, + "grad_norm": 2.2872471809387207, + "learning_rate": 7.0550215621529965e-06, + "loss": 1.1364, + "step": 2379 + }, + { + "epoch": 1.8013245033112582, + "grad_norm": 2.0598936080932617, + "learning_rate": 7.047351047650476e-06, + "loss": 1.1238, + "step": 2380 + }, + { + "epoch": 1.802081362346263, + "grad_norm": 2.055774688720703, + "learning_rate": 7.039682437025028e-06, + "loss": 1.1336, + "step": 2381 + }, + { + "epoch": 1.8028382213812677, + "grad_norm": 2.1142072677612305, + "learning_rate": 7.032015735219719e-06, + "loss": 1.1216, + "step": 2382 + }, + { + "epoch": 1.8035950804162724, + "grad_norm": 2.067873001098633, + "learning_rate": 7.024350947176391e-06, + "loss": 1.1253, + "step": 2383 + }, + { + "epoch": 1.8043519394512773, + "grad_norm": 1.906582236289978, + "learning_rate": 7.016688077835645e-06, + "loss": 1.1002, + "step": 2384 + }, + { + "epoch": 1.805108798486282, + "grad_norm": 2.005889892578125, + "learning_rate": 7.009027132136853e-06, + "loss": 1.135, + "step": 2385 + }, + { + "epoch": 1.8058656575212866, + "grad_norm": 1.9194884300231934, + "learning_rate": 7.001368115018144e-06, + "loss": 1.0872, + "step": 2386 + }, + { + "epoch": 1.8066225165562915, + "grad_norm": 2.044262647628784, + "learning_rate": 6.993711031416402e-06, + "loss": 1.0973, + "step": 2387 + }, + { + "epoch": 1.8073793755912961, + "grad_norm": 2.0718541145324707, + "learning_rate": 6.986055886267265e-06, + "loss": 1.1224, + "step": 2388 + }, + { + "epoch": 1.8081362346263008, + "grad_norm": 2.132376194000244, + "learning_rate": 6.97840268450513e-06, + "loss": 1.083, + "step": 2389 + }, + { + "epoch": 1.8088930936613057, + "grad_norm": 2.1769330501556396, + "learning_rate": 6.970751431063124e-06, + "loss": 1.148, + "step": 2390 + }, + { + "epoch": 1.8096499526963103, + "grad_norm": 2.1449358463287354, + "learning_rate": 6.963102130873134e-06, + "loss": 1.0967, + "step": 2391 + }, + { + "epoch": 1.810406811731315, + "grad_norm": 1.9859085083007812, + "learning_rate": 6.955454788865785e-06, + "loss": 1.1101, + "step": 2392 + }, + { + "epoch": 1.81116367076632, + "grad_norm": 2.211151361465454, + "learning_rate": 6.947809409970431e-06, + "loss": 1.114, + "step": 2393 + }, + { + "epoch": 1.8119205298013243, + "grad_norm": 2.0564661026000977, + "learning_rate": 6.940165999115169e-06, + "loss": 1.0703, + "step": 2394 + }, + { + "epoch": 1.8126773888363292, + "grad_norm": 2.2020647525787354, + "learning_rate": 6.932524561226824e-06, + "loss": 1.0784, + "step": 2395 + }, + { + "epoch": 1.8134342478713341, + "grad_norm": 2.0232954025268555, + "learning_rate": 6.924885101230955e-06, + "loss": 1.1231, + "step": 2396 + }, + { + "epoch": 1.8141911069063386, + "grad_norm": 2.0655837059020996, + "learning_rate": 6.917247624051836e-06, + "loss": 1.102, + "step": 2397 + }, + { + "epoch": 1.8149479659413434, + "grad_norm": 2.0320346355438232, + "learning_rate": 6.90961213461247e-06, + "loss": 1.119, + "step": 2398 + }, + { + "epoch": 1.815704824976348, + "grad_norm": 2.415329694747925, + "learning_rate": 6.901978637834579e-06, + "loss": 1.1015, + "step": 2399 + }, + { + "epoch": 1.8164616840113528, + "grad_norm": 1.962516188621521, + "learning_rate": 6.894347138638595e-06, + "loss": 1.1063, + "step": 2400 + }, + { + "epoch": 1.8172185430463577, + "grad_norm": 2.263796329498291, + "learning_rate": 6.886717641943668e-06, + "loss": 1.0946, + "step": 2401 + }, + { + "epoch": 1.8179754020813623, + "grad_norm": 1.8907090425491333, + "learning_rate": 6.879090152667655e-06, + "loss": 1.0842, + "step": 2402 + }, + { + "epoch": 1.818732261116367, + "grad_norm": 2.3313028812408447, + "learning_rate": 6.871464675727122e-06, + "loss": 1.0972, + "step": 2403 + }, + { + "epoch": 1.8194891201513719, + "grad_norm": 2.123699903488159, + "learning_rate": 6.8638412160373294e-06, + "loss": 1.0953, + "step": 2404 + }, + { + "epoch": 1.8202459791863765, + "grad_norm": 2.1058695316314697, + "learning_rate": 6.856219778512248e-06, + "loss": 1.0983, + "step": 2405 + }, + { + "epoch": 1.8210028382213812, + "grad_norm": 2.0354390144348145, + "learning_rate": 6.8486003680645384e-06, + "loss": 1.1184, + "step": 2406 + }, + { + "epoch": 1.821759697256386, + "grad_norm": 2.448774814605713, + "learning_rate": 6.840982989605554e-06, + "loss": 1.1902, + "step": 2407 + }, + { + "epoch": 1.8225165562913908, + "grad_norm": 2.067413330078125, + "learning_rate": 6.833367648045347e-06, + "loss": 1.0844, + "step": 2408 + }, + { + "epoch": 1.8232734153263954, + "grad_norm": 1.8351151943206787, + "learning_rate": 6.825754348292647e-06, + "loss": 1.0751, + "step": 2409 + }, + { + "epoch": 1.8240302743614003, + "grad_norm": 2.036219835281372, + "learning_rate": 6.8181430952548664e-06, + "loss": 1.1118, + "step": 2410 + }, + { + "epoch": 1.824787133396405, + "grad_norm": 2.2903716564178467, + "learning_rate": 6.810533893838111e-06, + "loss": 1.1085, + "step": 2411 + }, + { + "epoch": 1.8255439924314096, + "grad_norm": 2.1487245559692383, + "learning_rate": 6.802926748947149e-06, + "loss": 1.0766, + "step": 2412 + }, + { + "epoch": 1.8263008514664145, + "grad_norm": 2.073429822921753, + "learning_rate": 6.795321665485434e-06, + "loss": 1.1056, + "step": 2413 + }, + { + "epoch": 1.827057710501419, + "grad_norm": 2.1071133613586426, + "learning_rate": 6.7877186483550865e-06, + "loss": 1.1688, + "step": 2414 + }, + { + "epoch": 1.8278145695364238, + "grad_norm": 2.153792381286621, + "learning_rate": 6.780117702456892e-06, + "loss": 1.1281, + "step": 2415 + }, + { + "epoch": 1.8285714285714287, + "grad_norm": 2.046393632888794, + "learning_rate": 6.772518832690312e-06, + "loss": 1.1413, + "step": 2416 + }, + { + "epoch": 1.8293282876064332, + "grad_norm": 2.0445821285247803, + "learning_rate": 6.764922043953452e-06, + "loss": 1.0761, + "step": 2417 + }, + { + "epoch": 1.830085146641438, + "grad_norm": 2.1296608448028564, + "learning_rate": 6.757327341143093e-06, + "loss": 1.1077, + "step": 2418 + }, + { + "epoch": 1.8308420056764427, + "grad_norm": 2.218290328979492, + "learning_rate": 6.749734729154663e-06, + "loss": 1.0869, + "step": 2419 + }, + { + "epoch": 1.8315988647114474, + "grad_norm": 2.161032199859619, + "learning_rate": 6.742144212882244e-06, + "loss": 1.1116, + "step": 2420 + }, + { + "epoch": 1.8323557237464523, + "grad_norm": 1.979115605354309, + "learning_rate": 6.734555797218567e-06, + "loss": 1.1321, + "step": 2421 + }, + { + "epoch": 1.833112582781457, + "grad_norm": 2.016322135925293, + "learning_rate": 6.726969487055008e-06, + "loss": 1.1265, + "step": 2422 + }, + { + "epoch": 1.8338694418164616, + "grad_norm": 1.943589448928833, + "learning_rate": 6.719385287281589e-06, + "loss": 1.079, + "step": 2423 + }, + { + "epoch": 1.8346263008514665, + "grad_norm": 2.0779478549957275, + "learning_rate": 6.711803202786965e-06, + "loss": 1.161, + "step": 2424 + }, + { + "epoch": 1.8353831598864712, + "grad_norm": 1.9313197135925293, + "learning_rate": 6.7042232384584396e-06, + "loss": 1.114, + "step": 2425 + }, + { + "epoch": 1.8361400189214758, + "grad_norm": 2.177368640899658, + "learning_rate": 6.6966453991819355e-06, + "loss": 1.1141, + "step": 2426 + }, + { + "epoch": 1.8368968779564807, + "grad_norm": 2.155545234680176, + "learning_rate": 6.689069689842015e-06, + "loss": 1.1058, + "step": 2427 + }, + { + "epoch": 1.8376537369914854, + "grad_norm": 2.1615564823150635, + "learning_rate": 6.681496115321863e-06, + "loss": 1.1445, + "step": 2428 + }, + { + "epoch": 1.83841059602649, + "grad_norm": 2.2454423904418945, + "learning_rate": 6.6739246805032895e-06, + "loss": 1.1575, + "step": 2429 + }, + { + "epoch": 1.839167455061495, + "grad_norm": 1.9341751337051392, + "learning_rate": 6.6663553902667345e-06, + "loss": 1.0993, + "step": 2430 + }, + { + "epoch": 1.8399243140964994, + "grad_norm": 2.307654857635498, + "learning_rate": 6.658788249491236e-06, + "loss": 1.0903, + "step": 2431 + }, + { + "epoch": 1.8406811731315043, + "grad_norm": 2.172126293182373, + "learning_rate": 6.651223263054462e-06, + "loss": 1.1384, + "step": 2432 + }, + { + "epoch": 1.8414380321665091, + "grad_norm": 2.061699151992798, + "learning_rate": 6.64366043583269e-06, + "loss": 1.1066, + "step": 2433 + }, + { + "epoch": 1.8421948912015136, + "grad_norm": 2.0565085411071777, + "learning_rate": 6.636099772700797e-06, + "loss": 1.1265, + "step": 2434 + }, + { + "epoch": 1.8429517502365185, + "grad_norm": 2.3713178634643555, + "learning_rate": 6.628541278532276e-06, + "loss": 1.1067, + "step": 2435 + }, + { + "epoch": 1.8437086092715231, + "grad_norm": 2.0300235748291016, + "learning_rate": 6.620984958199217e-06, + "loss": 1.1053, + "step": 2436 + }, + { + "epoch": 1.8444654683065278, + "grad_norm": 1.8853594064712524, + "learning_rate": 6.613430816572308e-06, + "loss": 1.1375, + "step": 2437 + }, + { + "epoch": 1.8452223273415327, + "grad_norm": 2.140911102294922, + "learning_rate": 6.605878858520832e-06, + "loss": 1.1372, + "step": 2438 + }, + { + "epoch": 1.8459791863765374, + "grad_norm": 2.0533270835876465, + "learning_rate": 6.598329088912666e-06, + "loss": 1.1054, + "step": 2439 + }, + { + "epoch": 1.846736045411542, + "grad_norm": 2.0813000202178955, + "learning_rate": 6.59078151261428e-06, + "loss": 1.0635, + "step": 2440 + }, + { + "epoch": 1.847492904446547, + "grad_norm": 2.0938546657562256, + "learning_rate": 6.5832361344907225e-06, + "loss": 1.1368, + "step": 2441 + }, + { + "epoch": 1.8482497634815516, + "grad_norm": 2.1274354457855225, + "learning_rate": 6.57569295940563e-06, + "loss": 1.1446, + "step": 2442 + }, + { + "epoch": 1.8490066225165562, + "grad_norm": 2.2737364768981934, + "learning_rate": 6.5681519922212175e-06, + "loss": 1.1007, + "step": 2443 + }, + { + "epoch": 1.8497634815515611, + "grad_norm": 2.0562212467193604, + "learning_rate": 6.560613237798282e-06, + "loss": 1.1033, + "step": 2444 + }, + { + "epoch": 1.8505203405865658, + "grad_norm": 2.1894006729125977, + "learning_rate": 6.553076700996186e-06, + "loss": 1.1733, + "step": 2445 + }, + { + "epoch": 1.8512771996215704, + "grad_norm": 2.1526927947998047, + "learning_rate": 6.545542386672864e-06, + "loss": 1.1254, + "step": 2446 + }, + { + "epoch": 1.8520340586565753, + "grad_norm": 2.335092306137085, + "learning_rate": 6.538010299684827e-06, + "loss": 1.089, + "step": 2447 + }, + { + "epoch": 1.85279091769158, + "grad_norm": 2.1147849559783936, + "learning_rate": 6.530480444887135e-06, + "loss": 1.1075, + "step": 2448 + }, + { + "epoch": 1.8535477767265847, + "grad_norm": 1.9990819692611694, + "learning_rate": 6.522952827133424e-06, + "loss": 1.1069, + "step": 2449 + }, + { + "epoch": 1.8543046357615895, + "grad_norm": 2.2554056644439697, + "learning_rate": 6.515427451275879e-06, + "loss": 1.1205, + "step": 2450 + }, + { + "epoch": 1.855061494796594, + "grad_norm": 2.143373489379883, + "learning_rate": 6.507904322165242e-06, + "loss": 1.1, + "step": 2451 + }, + { + "epoch": 1.8558183538315989, + "grad_norm": 2.145324468612671, + "learning_rate": 6.500383444650808e-06, + "loss": 1.124, + "step": 2452 + }, + { + "epoch": 1.8565752128666035, + "grad_norm": 2.0681822299957275, + "learning_rate": 6.492864823580418e-06, + "loss": 1.1404, + "step": 2453 + }, + { + "epoch": 1.8573320719016082, + "grad_norm": 2.0816290378570557, + "learning_rate": 6.485348463800467e-06, + "loss": 1.121, + "step": 2454 + }, + { + "epoch": 1.858088930936613, + "grad_norm": 2.210402488708496, + "learning_rate": 6.477834370155879e-06, + "loss": 1.081, + "step": 2455 + }, + { + "epoch": 1.8588457899716178, + "grad_norm": 2.258357286453247, + "learning_rate": 6.4703225474901266e-06, + "loss": 1.1221, + "step": 2456 + }, + { + "epoch": 1.8596026490066224, + "grad_norm": 2.336432456970215, + "learning_rate": 6.462813000645216e-06, + "loss": 1.1288, + "step": 2457 + }, + { + "epoch": 1.8603595080416273, + "grad_norm": 2.224451780319214, + "learning_rate": 6.4553057344616885e-06, + "loss": 1.1213, + "step": 2458 + }, + { + "epoch": 1.861116367076632, + "grad_norm": 2.012571096420288, + "learning_rate": 6.447800753778608e-06, + "loss": 1.1079, + "step": 2459 + }, + { + "epoch": 1.8618732261116366, + "grad_norm": 2.0077013969421387, + "learning_rate": 6.440298063433578e-06, + "loss": 1.1139, + "step": 2460 + }, + { + "epoch": 1.8626300851466415, + "grad_norm": 2.0572779178619385, + "learning_rate": 6.432797668262713e-06, + "loss": 1.1225, + "step": 2461 + }, + { + "epoch": 1.8633869441816462, + "grad_norm": 2.052415609359741, + "learning_rate": 6.425299573100653e-06, + "loss": 1.1232, + "step": 2462 + }, + { + "epoch": 1.8641438032166509, + "grad_norm": 2.1070804595947266, + "learning_rate": 6.41780378278056e-06, + "loss": 1.1425, + "step": 2463 + }, + { + "epoch": 1.8649006622516557, + "grad_norm": 2.1018309593200684, + "learning_rate": 6.410310302134102e-06, + "loss": 1.124, + "step": 2464 + }, + { + "epoch": 1.8656575212866604, + "grad_norm": 2.104137897491455, + "learning_rate": 6.4028191359914635e-06, + "loss": 1.1366, + "step": 2465 + }, + { + "epoch": 1.866414380321665, + "grad_norm": 2.196840763092041, + "learning_rate": 6.395330289181339e-06, + "loss": 1.1138, + "step": 2466 + }, + { + "epoch": 1.86717123935667, + "grad_norm": 2.0204899311065674, + "learning_rate": 6.38784376653092e-06, + "loss": 1.1571, + "step": 2467 + }, + { + "epoch": 1.8679280983916744, + "grad_norm": 2.1718480587005615, + "learning_rate": 6.380359572865909e-06, + "loss": 1.1265, + "step": 2468 + }, + { + "epoch": 1.8686849574266793, + "grad_norm": 2.2680718898773193, + "learning_rate": 6.372877713010501e-06, + "loss": 1.1218, + "step": 2469 + }, + { + "epoch": 1.8694418164616842, + "grad_norm": 1.9217084646224976, + "learning_rate": 6.365398191787388e-06, + "loss": 1.0846, + "step": 2470 + }, + { + "epoch": 1.8701986754966886, + "grad_norm": 2.0585711002349854, + "learning_rate": 6.35792101401776e-06, + "loss": 1.1274, + "step": 2471 + }, + { + "epoch": 1.8709555345316935, + "grad_norm": 1.989283800125122, + "learning_rate": 6.350446184521285e-06, + "loss": 1.095, + "step": 2472 + }, + { + "epoch": 1.8717123935666982, + "grad_norm": 1.886738657951355, + "learning_rate": 6.3429737081161265e-06, + "loss": 1.1196, + "step": 2473 + }, + { + "epoch": 1.8724692526017028, + "grad_norm": 1.9688234329223633, + "learning_rate": 6.335503589618933e-06, + "loss": 1.143, + "step": 2474 + }, + { + "epoch": 1.8732261116367077, + "grad_norm": 2.37060284614563, + "learning_rate": 6.328035833844823e-06, + "loss": 1.1088, + "step": 2475 + }, + { + "epoch": 1.8739829706717124, + "grad_norm": 2.3199589252471924, + "learning_rate": 6.320570445607399e-06, + "loss": 1.1072, + "step": 2476 + }, + { + "epoch": 1.874739829706717, + "grad_norm": 1.914215087890625, + "learning_rate": 6.313107429718741e-06, + "loss": 1.1222, + "step": 2477 + }, + { + "epoch": 1.875496688741722, + "grad_norm": 2.3843131065368652, + "learning_rate": 6.305646790989391e-06, + "loss": 1.1509, + "step": 2478 + }, + { + "epoch": 1.8762535477767266, + "grad_norm": 2.1501553058624268, + "learning_rate": 6.298188534228365e-06, + "loss": 1.0925, + "step": 2479 + }, + { + "epoch": 1.8770104068117313, + "grad_norm": 2.103590965270996, + "learning_rate": 6.290732664243141e-06, + "loss": 1.1068, + "step": 2480 + }, + { + "epoch": 1.8777672658467361, + "grad_norm": 2.1373817920684814, + "learning_rate": 6.283279185839658e-06, + "loss": 1.1228, + "step": 2481 + }, + { + "epoch": 1.8785241248817408, + "grad_norm": 1.9448984861373901, + "learning_rate": 6.275828103822317e-06, + "loss": 1.1138, + "step": 2482 + }, + { + "epoch": 1.8792809839167455, + "grad_norm": 2.133575916290283, + "learning_rate": 6.268379422993969e-06, + "loss": 1.137, + "step": 2483 + }, + { + "epoch": 1.8800378429517504, + "grad_norm": 2.0521798133850098, + "learning_rate": 6.26093314815592e-06, + "loss": 1.1077, + "step": 2484 + }, + { + "epoch": 1.8807947019867548, + "grad_norm": 2.154632091522217, + "learning_rate": 6.253489284107929e-06, + "loss": 1.0963, + "step": 2485 + }, + { + "epoch": 1.8815515610217597, + "grad_norm": 2.0606281757354736, + "learning_rate": 6.246047835648191e-06, + "loss": 1.1233, + "step": 2486 + }, + { + "epoch": 1.8823084200567646, + "grad_norm": 1.9377020597457886, + "learning_rate": 6.238608807573355e-06, + "loss": 1.128, + "step": 2487 + }, + { + "epoch": 1.883065279091769, + "grad_norm": 2.133552074432373, + "learning_rate": 6.231172204678507e-06, + "loss": 1.0872, + "step": 2488 + }, + { + "epoch": 1.883822138126774, + "grad_norm": 2.1637847423553467, + "learning_rate": 6.2237380317571626e-06, + "loss": 1.1051, + "step": 2489 + }, + { + "epoch": 1.8845789971617786, + "grad_norm": 2.0870983600616455, + "learning_rate": 6.216306293601277e-06, + "loss": 1.1296, + "step": 2490 + }, + { + "epoch": 1.8853358561967832, + "grad_norm": 2.129365921020508, + "learning_rate": 6.20887699500124e-06, + "loss": 1.1125, + "step": 2491 + }, + { + "epoch": 1.8860927152317881, + "grad_norm": 1.887802004814148, + "learning_rate": 6.20145014074586e-06, + "loss": 1.0531, + "step": 2492 + }, + { + "epoch": 1.8868495742667928, + "grad_norm": 2.163595199584961, + "learning_rate": 6.194025735622371e-06, + "loss": 1.0727, + "step": 2493 + }, + { + "epoch": 1.8876064333017974, + "grad_norm": 1.8616597652435303, + "learning_rate": 6.186603784416441e-06, + "loss": 1.0561, + "step": 2494 + }, + { + "epoch": 1.8883632923368023, + "grad_norm": 2.1504313945770264, + "learning_rate": 6.179184291912138e-06, + "loss": 1.1093, + "step": 2495 + }, + { + "epoch": 1.889120151371807, + "grad_norm": 1.997025728225708, + "learning_rate": 6.171767262891958e-06, + "loss": 1.0991, + "step": 2496 + }, + { + "epoch": 1.8898770104068117, + "grad_norm": 2.141857147216797, + "learning_rate": 6.164352702136799e-06, + "loss": 1.0897, + "step": 2497 + }, + { + "epoch": 1.8906338694418166, + "grad_norm": 1.9679754972457886, + "learning_rate": 6.15694061442598e-06, + "loss": 1.0971, + "step": 2498 + }, + { + "epoch": 1.8913907284768212, + "grad_norm": 2.048257350921631, + "learning_rate": 6.149531004537222e-06, + "loss": 1.1139, + "step": 2499 + }, + { + "epoch": 1.8921475875118259, + "grad_norm": 2.4383885860443115, + "learning_rate": 6.1421238772466375e-06, + "loss": 1.1028, + "step": 2500 + }, + { + "epoch": 1.8929044465468308, + "grad_norm": 2.0352723598480225, + "learning_rate": 6.134719237328751e-06, + "loss": 1.0957, + "step": 2501 + }, + { + "epoch": 1.8936613055818354, + "grad_norm": 2.1713624000549316, + "learning_rate": 6.127317089556489e-06, + "loss": 1.0726, + "step": 2502 + }, + { + "epoch": 1.89441816461684, + "grad_norm": 2.2224864959716797, + "learning_rate": 6.119917438701151e-06, + "loss": 1.0919, + "step": 2503 + }, + { + "epoch": 1.895175023651845, + "grad_norm": 2.1596179008483887, + "learning_rate": 6.112520289532445e-06, + "loss": 1.1273, + "step": 2504 + }, + { + "epoch": 1.8959318826868494, + "grad_norm": 2.018328905105591, + "learning_rate": 6.105125646818463e-06, + "loss": 1.1354, + "step": 2505 + }, + { + "epoch": 1.8966887417218543, + "grad_norm": 2.1755290031433105, + "learning_rate": 6.097733515325671e-06, + "loss": 1.1037, + "step": 2506 + }, + { + "epoch": 1.8974456007568592, + "grad_norm": 2.172973871231079, + "learning_rate": 6.090343899818931e-06, + "loss": 1.0656, + "step": 2507 + }, + { + "epoch": 1.8982024597918636, + "grad_norm": 2.193934679031372, + "learning_rate": 6.0829568050614725e-06, + "loss": 1.1252, + "step": 2508 + }, + { + "epoch": 1.8989593188268685, + "grad_norm": 2.3651788234710693, + "learning_rate": 6.075572235814909e-06, + "loss": 1.1242, + "step": 2509 + }, + { + "epoch": 1.8997161778618732, + "grad_norm": 2.107897996902466, + "learning_rate": 6.0681901968392184e-06, + "loss": 1.0937, + "step": 2510 + }, + { + "epoch": 1.9004730368968779, + "grad_norm": 2.57551908493042, + "learning_rate": 6.060810692892748e-06, + "loss": 1.129, + "step": 2511 + }, + { + "epoch": 1.9012298959318827, + "grad_norm": 2.0832760334014893, + "learning_rate": 6.053433728732217e-06, + "loss": 1.138, + "step": 2512 + }, + { + "epoch": 1.9019867549668874, + "grad_norm": 1.9609954357147217, + "learning_rate": 6.046059309112703e-06, + "loss": 1.1404, + "step": 2513 + }, + { + "epoch": 1.902743614001892, + "grad_norm": 2.1895411014556885, + "learning_rate": 6.038687438787642e-06, + "loss": 1.1378, + "step": 2514 + }, + { + "epoch": 1.903500473036897, + "grad_norm": 2.058955430984497, + "learning_rate": 6.031318122508833e-06, + "loss": 1.117, + "step": 2515 + }, + { + "epoch": 1.9042573320719016, + "grad_norm": 2.1496293544769287, + "learning_rate": 6.023951365026426e-06, + "loss": 1.1115, + "step": 2516 + }, + { + "epoch": 1.9050141911069063, + "grad_norm": 2.147587776184082, + "learning_rate": 6.016587171088913e-06, + "loss": 1.1419, + "step": 2517 + }, + { + "epoch": 1.9057710501419112, + "grad_norm": 2.470024824142456, + "learning_rate": 6.009225545443148e-06, + "loss": 1.1518, + "step": 2518 + }, + { + "epoch": 1.9065279091769158, + "grad_norm": 2.0301973819732666, + "learning_rate": 6.001866492834322e-06, + "loss": 1.0815, + "step": 2519 + }, + { + "epoch": 1.9072847682119205, + "grad_norm": 2.2255492210388184, + "learning_rate": 5.994510018005964e-06, + "loss": 1.1555, + "step": 2520 + }, + { + "epoch": 1.9080416272469254, + "grad_norm": 2.101928472518921, + "learning_rate": 5.987156125699951e-06, + "loss": 1.1251, + "step": 2521 + }, + { + "epoch": 1.9087984862819298, + "grad_norm": 2.0817983150482178, + "learning_rate": 5.979804820656483e-06, + "loss": 1.1233, + "step": 2522 + }, + { + "epoch": 1.9095553453169347, + "grad_norm": 2.082615375518799, + "learning_rate": 5.972456107614105e-06, + "loss": 1.1198, + "step": 2523 + }, + { + "epoch": 1.9103122043519396, + "grad_norm": 2.036180257797241, + "learning_rate": 5.965109991309686e-06, + "loss": 1.1056, + "step": 2524 + }, + { + "epoch": 1.911069063386944, + "grad_norm": 2.358384847640991, + "learning_rate": 5.9577664764784126e-06, + "loss": 1.1125, + "step": 2525 + }, + { + "epoch": 1.911825922421949, + "grad_norm": 2.097381830215454, + "learning_rate": 5.950425567853813e-06, + "loss": 1.1394, + "step": 2526 + }, + { + "epoch": 1.9125827814569536, + "grad_norm": 2.0445775985717773, + "learning_rate": 5.943087270167718e-06, + "loss": 1.1276, + "step": 2527 + }, + { + "epoch": 1.9133396404919583, + "grad_norm": 2.2490360736846924, + "learning_rate": 5.935751588150282e-06, + "loss": 1.0963, + "step": 2528 + }, + { + "epoch": 1.9140964995269631, + "grad_norm": 2.212881088256836, + "learning_rate": 5.928418526529981e-06, + "loss": 1.0829, + "step": 2529 + }, + { + "epoch": 1.9148533585619678, + "grad_norm": 1.9197094440460205, + "learning_rate": 5.921088090033585e-06, + "loss": 1.0947, + "step": 2530 + }, + { + "epoch": 1.9156102175969725, + "grad_norm": 2.0829176902770996, + "learning_rate": 5.913760283386186e-06, + "loss": 1.1466, + "step": 2531 + }, + { + "epoch": 1.9163670766319774, + "grad_norm": 2.326220750808716, + "learning_rate": 5.906435111311179e-06, + "loss": 1.131, + "step": 2532 + }, + { + "epoch": 1.917123935666982, + "grad_norm": 2.2894301414489746, + "learning_rate": 5.899112578530255e-06, + "loss": 1.1062, + "step": 2533 + }, + { + "epoch": 1.9178807947019867, + "grad_norm": 2.134059190750122, + "learning_rate": 5.891792689763407e-06, + "loss": 1.1116, + "step": 2534 + }, + { + "epoch": 1.9186376537369916, + "grad_norm": 2.1360747814178467, + "learning_rate": 5.884475449728925e-06, + "loss": 1.15, + "step": 2535 + }, + { + "epoch": 1.9193945127719962, + "grad_norm": 2.3759396076202393, + "learning_rate": 5.877160863143391e-06, + "loss": 1.0696, + "step": 2536 + }, + { + "epoch": 1.920151371807001, + "grad_norm": 2.216271162033081, + "learning_rate": 5.869848934721671e-06, + "loss": 1.166, + "step": 2537 + }, + { + "epoch": 1.9209082308420058, + "grad_norm": 2.0322463512420654, + "learning_rate": 5.86253966917693e-06, + "loss": 1.1031, + "step": 2538 + }, + { + "epoch": 1.9216650898770105, + "grad_norm": 1.9586721658706665, + "learning_rate": 5.855233071220603e-06, + "loss": 1.1062, + "step": 2539 + }, + { + "epoch": 1.9224219489120151, + "grad_norm": 2.202064037322998, + "learning_rate": 5.8479291455624186e-06, + "loss": 1.1295, + "step": 2540 + }, + { + "epoch": 1.92317880794702, + "grad_norm": 2.291038751602173, + "learning_rate": 5.840627896910365e-06, + "loss": 1.157, + "step": 2541 + }, + { + "epoch": 1.9239356669820245, + "grad_norm": 1.989047884941101, + "learning_rate": 5.833329329970726e-06, + "loss": 1.1506, + "step": 2542 + }, + { + "epoch": 1.9246925260170293, + "grad_norm": 1.9984663724899292, + "learning_rate": 5.82603344944804e-06, + "loss": 1.106, + "step": 2543 + }, + { + "epoch": 1.9254493850520342, + "grad_norm": 2.3392581939697266, + "learning_rate": 5.818740260045123e-06, + "loss": 1.1819, + "step": 2544 + }, + { + "epoch": 1.9262062440870387, + "grad_norm": 2.148768663406372, + "learning_rate": 5.811449766463058e-06, + "loss": 1.1439, + "step": 2545 + }, + { + "epoch": 1.9269631031220436, + "grad_norm": 2.033663511276245, + "learning_rate": 5.804161973401175e-06, + "loss": 1.1111, + "step": 2546 + }, + { + "epoch": 1.9277199621570482, + "grad_norm": 2.2173452377319336, + "learning_rate": 5.796876885557084e-06, + "loss": 1.0752, + "step": 2547 + }, + { + "epoch": 1.9284768211920529, + "grad_norm": 2.039340019226074, + "learning_rate": 5.78959450762664e-06, + "loss": 1.0878, + "step": 2548 + }, + { + "epoch": 1.9292336802270578, + "grad_norm": 2.22098445892334, + "learning_rate": 5.782314844303949e-06, + "loss": 1.1109, + "step": 2549 + }, + { + "epoch": 1.9299905392620624, + "grad_norm": 1.9632805585861206, + "learning_rate": 5.775037900281372e-06, + "loss": 1.0981, + "step": 2550 + }, + { + "epoch": 1.930747398297067, + "grad_norm": 2.195981025695801, + "learning_rate": 5.767763680249521e-06, + "loss": 1.0659, + "step": 2551 + }, + { + "epoch": 1.931504257332072, + "grad_norm": 2.3889381885528564, + "learning_rate": 5.760492188897241e-06, + "loss": 1.1027, + "step": 2552 + }, + { + "epoch": 1.9322611163670766, + "grad_norm": 2.212132692337036, + "learning_rate": 5.753223430911625e-06, + "loss": 1.1435, + "step": 2553 + }, + { + "epoch": 1.9330179754020813, + "grad_norm": 2.109678268432617, + "learning_rate": 5.7459574109780105e-06, + "loss": 1.1226, + "step": 2554 + }, + { + "epoch": 1.9337748344370862, + "grad_norm": 2.1192758083343506, + "learning_rate": 5.738694133779954e-06, + "loss": 1.158, + "step": 2555 + }, + { + "epoch": 1.9345316934720909, + "grad_norm": 2.249246597290039, + "learning_rate": 5.7314336039992565e-06, + "loss": 1.1342, + "step": 2556 + }, + { + "epoch": 1.9352885525070955, + "grad_norm": 2.1159563064575195, + "learning_rate": 5.7241758263159504e-06, + "loss": 1.0984, + "step": 2557 + }, + { + "epoch": 1.9360454115421004, + "grad_norm": 2.092686891555786, + "learning_rate": 5.7169208054082794e-06, + "loss": 1.1462, + "step": 2558 + }, + { + "epoch": 1.9368022705771049, + "grad_norm": 2.201853036880493, + "learning_rate": 5.7096685459527235e-06, + "loss": 1.1121, + "step": 2559 + }, + { + "epoch": 1.9375591296121097, + "grad_norm": 2.29725980758667, + "learning_rate": 5.702419052623982e-06, + "loss": 1.17, + "step": 2560 + }, + { + "epoch": 1.9383159886471146, + "grad_norm": 2.636181592941284, + "learning_rate": 5.695172330094961e-06, + "loss": 1.1398, + "step": 2561 + }, + { + "epoch": 1.939072847682119, + "grad_norm": 2.0273807048797607, + "learning_rate": 5.687928383036795e-06, + "loss": 1.0939, + "step": 2562 + }, + { + "epoch": 1.939829706717124, + "grad_norm": 2.2744505405426025, + "learning_rate": 5.680687216118814e-06, + "loss": 1.1233, + "step": 2563 + }, + { + "epoch": 1.9405865657521286, + "grad_norm": 1.976132869720459, + "learning_rate": 5.6734488340085665e-06, + "loss": 1.1573, + "step": 2564 + }, + { + "epoch": 1.9413434247871333, + "grad_norm": 5.837771415710449, + "learning_rate": 5.666213241371809e-06, + "loss": 1.1299, + "step": 2565 + }, + { + "epoch": 1.9421002838221382, + "grad_norm": 2.1729319095611572, + "learning_rate": 5.658980442872484e-06, + "loss": 1.1399, + "step": 2566 + }, + { + "epoch": 1.9428571428571428, + "grad_norm": 2.049954652786255, + "learning_rate": 5.651750443172749e-06, + "loss": 1.1012, + "step": 2567 + }, + { + "epoch": 1.9436140018921475, + "grad_norm": 2.170069694519043, + "learning_rate": 5.644523246932951e-06, + "loss": 1.1276, + "step": 2568 + }, + { + "epoch": 1.9443708609271524, + "grad_norm": 2.143918514251709, + "learning_rate": 5.637298858811633e-06, + "loss": 1.1127, + "step": 2569 + }, + { + "epoch": 1.945127719962157, + "grad_norm": 2.114530324935913, + "learning_rate": 5.6300772834655195e-06, + "loss": 1.0949, + "step": 2570 + }, + { + "epoch": 1.9458845789971617, + "grad_norm": 2.067688226699829, + "learning_rate": 5.6228585255495315e-06, + "loss": 1.1402, + "step": 2571 + }, + { + "epoch": 1.9466414380321666, + "grad_norm": 2.254387855529785, + "learning_rate": 5.615642589716773e-06, + "loss": 1.197, + "step": 2572 + }, + { + "epoch": 1.9473982970671713, + "grad_norm": 2.0530171394348145, + "learning_rate": 5.608429480618519e-06, + "loss": 1.0985, + "step": 2573 + }, + { + "epoch": 1.948155156102176, + "grad_norm": 1.9690866470336914, + "learning_rate": 5.6012192029042354e-06, + "loss": 1.1004, + "step": 2574 + }, + { + "epoch": 1.9489120151371808, + "grad_norm": 2.2864949703216553, + "learning_rate": 5.594011761221554e-06, + "loss": 1.1298, + "step": 2575 + }, + { + "epoch": 1.9496688741721855, + "grad_norm": 2.291849374771118, + "learning_rate": 5.5868071602162875e-06, + "loss": 1.1472, + "step": 2576 + }, + { + "epoch": 1.9504257332071901, + "grad_norm": 2.223792552947998, + "learning_rate": 5.579605404532403e-06, + "loss": 1.1307, + "step": 2577 + }, + { + "epoch": 1.951182592242195, + "grad_norm": 2.0151219367980957, + "learning_rate": 5.572406498812049e-06, + "loss": 1.1281, + "step": 2578 + }, + { + "epoch": 1.9519394512771995, + "grad_norm": 2.023299217224121, + "learning_rate": 5.565210447695529e-06, + "loss": 1.165, + "step": 2579 + }, + { + "epoch": 1.9526963103122044, + "grad_norm": 1.902061939239502, + "learning_rate": 5.5580172558213064e-06, + "loss": 1.061, + "step": 2580 + }, + { + "epoch": 1.953453169347209, + "grad_norm": 2.015148639678955, + "learning_rate": 5.550826927826003e-06, + "loss": 1.0866, + "step": 2581 + }, + { + "epoch": 1.9542100283822137, + "grad_norm": 1.9616479873657227, + "learning_rate": 5.5436394683443996e-06, + "loss": 1.0903, + "step": 2582 + }, + { + "epoch": 1.9549668874172186, + "grad_norm": 1.9738472700119019, + "learning_rate": 5.536454882009412e-06, + "loss": 1.1312, + "step": 2583 + }, + { + "epoch": 1.9557237464522232, + "grad_norm": 2.2209506034851074, + "learning_rate": 5.52927317345213e-06, + "loss": 1.0352, + "step": 2584 + }, + { + "epoch": 1.956480605487228, + "grad_norm": 2.2492170333862305, + "learning_rate": 5.522094347301757e-06, + "loss": 1.0982, + "step": 2585 + }, + { + "epoch": 1.9572374645222328, + "grad_norm": 1.9593442678451538, + "learning_rate": 5.514918408185666e-06, + "loss": 1.1162, + "step": 2586 + }, + { + "epoch": 1.9579943235572375, + "grad_norm": 2.0279417037963867, + "learning_rate": 5.507745360729356e-06, + "loss": 1.1148, + "step": 2587 + }, + { + "epoch": 1.9587511825922421, + "grad_norm": 2.1683051586151123, + "learning_rate": 5.500575209556462e-06, + "loss": 1.1078, + "step": 2588 + }, + { + "epoch": 1.959508041627247, + "grad_norm": 2.0510294437408447, + "learning_rate": 5.493407959288752e-06, + "loss": 1.1099, + "step": 2589 + }, + { + "epoch": 1.9602649006622517, + "grad_norm": 2.144102096557617, + "learning_rate": 5.486243614546135e-06, + "loss": 1.0938, + "step": 2590 + }, + { + "epoch": 1.9610217596972563, + "grad_norm": 1.9423801898956299, + "learning_rate": 5.479082179946628e-06, + "loss": 1.0941, + "step": 2591 + }, + { + "epoch": 1.9617786187322612, + "grad_norm": 1.9970104694366455, + "learning_rate": 5.471923660106387e-06, + "loss": 1.1106, + "step": 2592 + }, + { + "epoch": 1.962535477767266, + "grad_norm": 2.0680718421936035, + "learning_rate": 5.46476805963969e-06, + "loss": 1.1039, + "step": 2593 + }, + { + "epoch": 1.9632923368022706, + "grad_norm": 2.138693332672119, + "learning_rate": 5.457615383158917e-06, + "loss": 1.1283, + "step": 2594 + }, + { + "epoch": 1.9640491958372754, + "grad_norm": 2.0552265644073486, + "learning_rate": 5.450465635274581e-06, + "loss": 1.0889, + "step": 2595 + }, + { + "epoch": 1.9648060548722799, + "grad_norm": 1.974301815032959, + "learning_rate": 5.443318820595303e-06, + "loss": 1.1174, + "step": 2596 + }, + { + "epoch": 1.9655629139072848, + "grad_norm": 1.9770950078964233, + "learning_rate": 5.436174943727803e-06, + "loss": 1.1261, + "step": 2597 + }, + { + "epoch": 1.9663197729422897, + "grad_norm": 2.0605309009552, + "learning_rate": 5.42903400927692e-06, + "loss": 1.1453, + "step": 2598 + }, + { + "epoch": 1.967076631977294, + "grad_norm": 1.9402192831039429, + "learning_rate": 5.421896021845591e-06, + "loss": 1.1047, + "step": 2599 + }, + { + "epoch": 1.967833491012299, + "grad_norm": 2.0428860187530518, + "learning_rate": 5.4147609860348545e-06, + "loss": 1.0957, + "step": 2600 + }, + { + "epoch": 1.9685903500473036, + "grad_norm": 1.9550975561141968, + "learning_rate": 5.407628906443844e-06, + "loss": 1.1202, + "step": 2601 + }, + { + "epoch": 1.9693472090823083, + "grad_norm": 1.9513860940933228, + "learning_rate": 5.400499787669788e-06, + "loss": 1.0562, + "step": 2602 + }, + { + "epoch": 1.9701040681173132, + "grad_norm": 2.2109532356262207, + "learning_rate": 5.393373634308015e-06, + "loss": 1.1637, + "step": 2603 + }, + { + "epoch": 1.9708609271523179, + "grad_norm": 1.9870307445526123, + "learning_rate": 5.3862504509519245e-06, + "loss": 1.1243, + "step": 2604 + }, + { + "epoch": 1.9716177861873225, + "grad_norm": 2.027862310409546, + "learning_rate": 5.379130242193018e-06, + "loss": 1.0791, + "step": 2605 + }, + { + "epoch": 1.9723746452223274, + "grad_norm": 1.969875454902649, + "learning_rate": 5.372013012620875e-06, + "loss": 1.1593, + "step": 2606 + }, + { + "epoch": 1.973131504257332, + "grad_norm": 2.163132667541504, + "learning_rate": 5.3648987668231475e-06, + "loss": 1.1337, + "step": 2607 + }, + { + "epoch": 1.9738883632923367, + "grad_norm": 2.0548205375671387, + "learning_rate": 5.357787509385571e-06, + "loss": 1.1185, + "step": 2608 + }, + { + "epoch": 1.9746452223273416, + "grad_norm": 2.120103597640991, + "learning_rate": 5.350679244891962e-06, + "loss": 1.1419, + "step": 2609 + }, + { + "epoch": 1.9754020813623463, + "grad_norm": 2.1263537406921387, + "learning_rate": 5.343573977924188e-06, + "loss": 1.1043, + "step": 2610 + }, + { + "epoch": 1.976158940397351, + "grad_norm": 2.0377280712127686, + "learning_rate": 5.3364717130622e-06, + "loss": 1.0852, + "step": 2611 + }, + { + "epoch": 1.9769157994323558, + "grad_norm": 1.9558144807815552, + "learning_rate": 5.329372454884014e-06, + "loss": 1.1432, + "step": 2612 + }, + { + "epoch": 1.9776726584673603, + "grad_norm": 2.1129062175750732, + "learning_rate": 5.322276207965698e-06, + "loss": 1.1347, + "step": 2613 + }, + { + "epoch": 1.9784295175023652, + "grad_norm": 2.042936086654663, + "learning_rate": 5.315182976881382e-06, + "loss": 1.1201, + "step": 2614 + }, + { + "epoch": 1.97918637653737, + "grad_norm": 2.1350150108337402, + "learning_rate": 5.308092766203265e-06, + "loss": 1.0633, + "step": 2615 + }, + { + "epoch": 1.9799432355723745, + "grad_norm": 1.984386682510376, + "learning_rate": 5.301005580501579e-06, + "loss": 1.1045, + "step": 2616 + }, + { + "epoch": 1.9807000946073794, + "grad_norm": 1.9831804037094116, + "learning_rate": 5.293921424344624e-06, + "loss": 1.1376, + "step": 2617 + }, + { + "epoch": 1.981456953642384, + "grad_norm": 2.1934800148010254, + "learning_rate": 5.286840302298729e-06, + "loss": 1.1043, + "step": 2618 + }, + { + "epoch": 1.9822138126773887, + "grad_norm": 2.0572476387023926, + "learning_rate": 5.2797622189282835e-06, + "loss": 1.0807, + "step": 2619 + }, + { + "epoch": 1.9829706717123936, + "grad_norm": 2.0445570945739746, + "learning_rate": 5.272687178795715e-06, + "loss": 1.1075, + "step": 2620 + }, + { + "epoch": 1.9837275307473983, + "grad_norm": 2.0021796226501465, + "learning_rate": 5.265615186461479e-06, + "loss": 1.0971, + "step": 2621 + }, + { + "epoch": 1.984484389782403, + "grad_norm": 2.107959508895874, + "learning_rate": 5.258546246484077e-06, + "loss": 1.1329, + "step": 2622 + }, + { + "epoch": 1.9852412488174078, + "grad_norm": 1.8930269479751587, + "learning_rate": 5.251480363420041e-06, + "loss": 1.0927, + "step": 2623 + }, + { + "epoch": 1.9859981078524125, + "grad_norm": 2.0325968265533447, + "learning_rate": 5.244417541823935e-06, + "loss": 1.1009, + "step": 2624 + }, + { + "epoch": 1.9867549668874172, + "grad_norm": 2.3380300998687744, + "learning_rate": 5.237357786248337e-06, + "loss": 1.0821, + "step": 2625 + }, + { + "epoch": 1.987511825922422, + "grad_norm": 2.121659517288208, + "learning_rate": 5.230301101243864e-06, + "loss": 1.0595, + "step": 2626 + }, + { + "epoch": 1.9882686849574267, + "grad_norm": 2.329930067062378, + "learning_rate": 5.22324749135915e-06, + "loss": 1.1233, + "step": 2627 + }, + { + "epoch": 1.9890255439924314, + "grad_norm": 2.044088840484619, + "learning_rate": 5.216196961140837e-06, + "loss": 1.1064, + "step": 2628 + }, + { + "epoch": 1.9897824030274363, + "grad_norm": 2.201205015182495, + "learning_rate": 5.209149515133593e-06, + "loss": 1.1553, + "step": 2629 + }, + { + "epoch": 1.990539262062441, + "grad_norm": 2.029348850250244, + "learning_rate": 5.202105157880095e-06, + "loss": 1.119, + "step": 2630 + }, + { + "epoch": 1.9912961210974456, + "grad_norm": 2.332625150680542, + "learning_rate": 5.1950638939210296e-06, + "loss": 1.0767, + "step": 2631 + }, + { + "epoch": 1.9920529801324505, + "grad_norm": 2.2954776287078857, + "learning_rate": 5.188025727795084e-06, + "loss": 1.0942, + "step": 2632 + }, + { + "epoch": 1.992809839167455, + "grad_norm": 1.9560728073120117, + "learning_rate": 5.180990664038954e-06, + "loss": 1.0801, + "step": 2633 + }, + { + "epoch": 1.9935666982024598, + "grad_norm": 2.1281726360321045, + "learning_rate": 5.17395870718734e-06, + "loss": 1.1274, + "step": 2634 + }, + { + "epoch": 1.9943235572374647, + "grad_norm": 1.8601038455963135, + "learning_rate": 5.166929861772925e-06, + "loss": 1.0999, + "step": 2635 + }, + { + "epoch": 1.9950804162724691, + "grad_norm": 2.056415319442749, + "learning_rate": 5.159904132326399e-06, + "loss": 1.0721, + "step": 2636 + }, + { + "epoch": 1.995837275307474, + "grad_norm": 1.9574776887893677, + "learning_rate": 5.152881523376445e-06, + "loss": 1.1364, + "step": 2637 + }, + { + "epoch": 1.9965941343424787, + "grad_norm": 2.011434555053711, + "learning_rate": 5.145862039449723e-06, + "loss": 1.1044, + "step": 2638 + }, + { + "epoch": 1.9973509933774833, + "grad_norm": 1.9725828170776367, + "learning_rate": 5.138845685070891e-06, + "loss": 1.1425, + "step": 2639 + }, + { + "epoch": 1.9981078524124882, + "grad_norm": 2.337836742401123, + "learning_rate": 5.131832464762576e-06, + "loss": 1.1062, + "step": 2640 + }, + { + "epoch": 1.998864711447493, + "grad_norm": 2.0006585121154785, + "learning_rate": 5.1248223830454e-06, + "loss": 1.1117, + "step": 2641 + }, + { + "epoch": 1.9996215704824976, + "grad_norm": 2.063220977783203, + "learning_rate": 5.117815444437956e-06, + "loss": 1.0999, + "step": 2642 + }, + { + "epoch": 2.0003784295175024, + "grad_norm": 2.052854537963867, + "learning_rate": 5.110811653456801e-06, + "loss": 1.099, + "step": 2643 + }, + { + "epoch": 2.001135288552507, + "grad_norm": 2.046462059020996, + "learning_rate": 5.103811014616479e-06, + "loss": 1.0824, + "step": 2644 + }, + { + "epoch": 2.0018921475875118, + "grad_norm": 2.0290746688842773, + "learning_rate": 5.096813532429496e-06, + "loss": 1.0797, + "step": 2645 + }, + { + "epoch": 2.0026490066225167, + "grad_norm": 2.0245625972747803, + "learning_rate": 5.089819211406316e-06, + "loss": 1.078, + "step": 2646 + }, + { + "epoch": 2.003405865657521, + "grad_norm": 2.207991600036621, + "learning_rate": 5.082828056055375e-06, + "loss": 1.0669, + "step": 2647 + }, + { + "epoch": 2.004162724692526, + "grad_norm": 2.133127212524414, + "learning_rate": 5.075840070883069e-06, + "loss": 1.0906, + "step": 2648 + }, + { + "epoch": 2.004919583727531, + "grad_norm": 2.1217539310455322, + "learning_rate": 5.068855260393739e-06, + "loss": 1.1084, + "step": 2649 + }, + { + "epoch": 2.0056764427625353, + "grad_norm": 2.1101694107055664, + "learning_rate": 5.061873629089693e-06, + "loss": 1.0727, + "step": 2650 + }, + { + "epoch": 2.00643330179754, + "grad_norm": 2.0910747051239014, + "learning_rate": 5.054895181471185e-06, + "loss": 1.0533, + "step": 2651 + }, + { + "epoch": 2.007190160832545, + "grad_norm": 1.8526837825775146, + "learning_rate": 5.0479199220364085e-06, + "loss": 1.0245, + "step": 2652 + }, + { + "epoch": 2.0079470198675495, + "grad_norm": 2.0023531913757324, + "learning_rate": 5.040947855281515e-06, + "loss": 1.096, + "step": 2653 + }, + { + "epoch": 2.0087038789025544, + "grad_norm": 1.9686319828033447, + "learning_rate": 5.033978985700592e-06, + "loss": 1.0526, + "step": 2654 + }, + { + "epoch": 2.0094607379375593, + "grad_norm": 1.999506950378418, + "learning_rate": 5.02701331778567e-06, + "loss": 1.0875, + "step": 2655 + }, + { + "epoch": 2.0102175969725637, + "grad_norm": 2.096315860748291, + "learning_rate": 5.020050856026703e-06, + "loss": 1.0637, + "step": 2656 + }, + { + "epoch": 2.0109744560075686, + "grad_norm": 2.0473968982696533, + "learning_rate": 5.013091604911594e-06, + "loss": 1.1281, + "step": 2657 + }, + { + "epoch": 2.0117313150425735, + "grad_norm": 2.0134975910186768, + "learning_rate": 5.006135568926175e-06, + "loss": 1.0641, + "step": 2658 + }, + { + "epoch": 2.012488174077578, + "grad_norm": 1.9990431070327759, + "learning_rate": 4.999182752554189e-06, + "loss": 1.1031, + "step": 2659 + }, + { + "epoch": 2.013245033112583, + "grad_norm": 2.199671983718872, + "learning_rate": 4.992233160277321e-06, + "loss": 1.1043, + "step": 2660 + }, + { + "epoch": 2.0140018921475873, + "grad_norm": 2.0332555770874023, + "learning_rate": 4.985286796575174e-06, + "loss": 1.1057, + "step": 2661 + }, + { + "epoch": 2.014758751182592, + "grad_norm": 2.1199121475219727, + "learning_rate": 4.978343665925269e-06, + "loss": 1.036, + "step": 2662 + }, + { + "epoch": 2.015515610217597, + "grad_norm": 2.047947883605957, + "learning_rate": 4.9714037728030415e-06, + "loss": 1.0934, + "step": 2663 + }, + { + "epoch": 2.0162724692526015, + "grad_norm": 1.814427137374878, + "learning_rate": 4.964467121681834e-06, + "loss": 1.0809, + "step": 2664 + }, + { + "epoch": 2.0170293282876064, + "grad_norm": 2.189452648162842, + "learning_rate": 4.957533717032911e-06, + "loss": 1.0565, + "step": 2665 + }, + { + "epoch": 2.0177861873226113, + "grad_norm": 2.025991201400757, + "learning_rate": 4.95060356332544e-06, + "loss": 1.0633, + "step": 2666 + }, + { + "epoch": 2.0185430463576157, + "grad_norm": 2.3097431659698486, + "learning_rate": 4.943676665026492e-06, + "loss": 1.0527, + "step": 2667 + }, + { + "epoch": 2.0192999053926206, + "grad_norm": 2.235900402069092, + "learning_rate": 4.936753026601047e-06, + "loss": 1.0878, + "step": 2668 + }, + { + "epoch": 2.0200567644276255, + "grad_norm": 2.0237877368927, + "learning_rate": 4.929832652511963e-06, + "loss": 1.0243, + "step": 2669 + }, + { + "epoch": 2.02081362346263, + "grad_norm": 2.148148536682129, + "learning_rate": 4.922915547220014e-06, + "loss": 1.05, + "step": 2670 + }, + { + "epoch": 2.021570482497635, + "grad_norm": 2.204345464706421, + "learning_rate": 4.91600171518386e-06, + "loss": 1.0613, + "step": 2671 + }, + { + "epoch": 2.0223273415326397, + "grad_norm": 2.05426287651062, + "learning_rate": 4.909091160860053e-06, + "loss": 1.0683, + "step": 2672 + }, + { + "epoch": 2.023084200567644, + "grad_norm": 2.0507991313934326, + "learning_rate": 4.902183888703029e-06, + "loss": 1.1039, + "step": 2673 + }, + { + "epoch": 2.023841059602649, + "grad_norm": 2.111011505126953, + "learning_rate": 4.895279903165118e-06, + "loss": 1.0708, + "step": 2674 + }, + { + "epoch": 2.024597918637654, + "grad_norm": 2.152397871017456, + "learning_rate": 4.888379208696516e-06, + "loss": 1.135, + "step": 2675 + }, + { + "epoch": 2.0253547776726584, + "grad_norm": 2.062863826751709, + "learning_rate": 4.881481809745303e-06, + "loss": 1.0808, + "step": 2676 + }, + { + "epoch": 2.0261116367076633, + "grad_norm": 2.052548885345459, + "learning_rate": 4.874587710757442e-06, + "loss": 1.0729, + "step": 2677 + }, + { + "epoch": 2.026868495742668, + "grad_norm": 2.1231849193573, + "learning_rate": 4.8676969161767625e-06, + "loss": 1.13, + "step": 2678 + }, + { + "epoch": 2.0276253547776726, + "grad_norm": 1.9931443929672241, + "learning_rate": 4.860809430444969e-06, + "loss": 1.0863, + "step": 2679 + }, + { + "epoch": 2.0283822138126775, + "grad_norm": 1.980806589126587, + "learning_rate": 4.853925258001626e-06, + "loss": 1.1208, + "step": 2680 + }, + { + "epoch": 2.029139072847682, + "grad_norm": 2.0515875816345215, + "learning_rate": 4.847044403284166e-06, + "loss": 1.0301, + "step": 2681 + }, + { + "epoch": 2.029895931882687, + "grad_norm": 2.115715742111206, + "learning_rate": 4.840166870727887e-06, + "loss": 1.0677, + "step": 2682 + }, + { + "epoch": 2.0306527909176917, + "grad_norm": 1.9753094911575317, + "learning_rate": 4.833292664765935e-06, + "loss": 1.0814, + "step": 2683 + }, + { + "epoch": 2.031409649952696, + "grad_norm": 2.0974655151367188, + "learning_rate": 4.8264217898293226e-06, + "loss": 1.0553, + "step": 2684 + }, + { + "epoch": 2.032166508987701, + "grad_norm": 1.9045485258102417, + "learning_rate": 4.8195542503469145e-06, + "loss": 1.1018, + "step": 2685 + }, + { + "epoch": 2.032923368022706, + "grad_norm": 2.1343581676483154, + "learning_rate": 4.812690050745413e-06, + "loss": 1.1279, + "step": 2686 + }, + { + "epoch": 2.0336802270577103, + "grad_norm": 2.060368776321411, + "learning_rate": 4.805829195449382e-06, + "loss": 1.1021, + "step": 2687 + }, + { + "epoch": 2.0344370860927152, + "grad_norm": 2.2184059619903564, + "learning_rate": 4.798971688881224e-06, + "loss": 1.0819, + "step": 2688 + }, + { + "epoch": 2.03519394512772, + "grad_norm": 2.1077048778533936, + "learning_rate": 4.792117535461187e-06, + "loss": 1.0469, + "step": 2689 + }, + { + "epoch": 2.0359508041627246, + "grad_norm": 2.105867624282837, + "learning_rate": 4.7852667396073475e-06, + "loss": 1.1129, + "step": 2690 + }, + { + "epoch": 2.0367076631977294, + "grad_norm": 1.9376499652862549, + "learning_rate": 4.7784193057356234e-06, + "loss": 1.0889, + "step": 2691 + }, + { + "epoch": 2.0374645222327343, + "grad_norm": 2.1429734230041504, + "learning_rate": 4.771575238259769e-06, + "loss": 1.06, + "step": 2692 + }, + { + "epoch": 2.0382213812677388, + "grad_norm": 2.1066880226135254, + "learning_rate": 4.764734541591365e-06, + "loss": 1.0841, + "step": 2693 + }, + { + "epoch": 2.0389782403027437, + "grad_norm": 2.034998655319214, + "learning_rate": 4.757897220139822e-06, + "loss": 1.1155, + "step": 2694 + }, + { + "epoch": 2.0397350993377485, + "grad_norm": 2.0084969997406006, + "learning_rate": 4.751063278312371e-06, + "loss": 1.0365, + "step": 2695 + }, + { + "epoch": 2.040491958372753, + "grad_norm": 2.260364055633545, + "learning_rate": 4.744232720514074e-06, + "loss": 1.0722, + "step": 2696 + }, + { + "epoch": 2.041248817407758, + "grad_norm": 2.0515432357788086, + "learning_rate": 4.737405551147791e-06, + "loss": 1.1263, + "step": 2697 + }, + { + "epoch": 2.0420056764427623, + "grad_norm": 2.0565128326416016, + "learning_rate": 4.7305817746142186e-06, + "loss": 1.0697, + "step": 2698 + }, + { + "epoch": 2.042762535477767, + "grad_norm": 2.287740707397461, + "learning_rate": 4.723761395311858e-06, + "loss": 1.0616, + "step": 2699 + }, + { + "epoch": 2.043519394512772, + "grad_norm": 2.0816521644592285, + "learning_rate": 4.716944417637024e-06, + "loss": 1.0425, + "step": 2700 + }, + { + "epoch": 2.0442762535477765, + "grad_norm": 2.117865562438965, + "learning_rate": 4.710130845983837e-06, + "loss": 1.1141, + "step": 2701 + }, + { + "epoch": 2.0450331125827814, + "grad_norm": 1.9112534523010254, + "learning_rate": 4.703320684744216e-06, + "loss": 1.06, + "step": 2702 + }, + { + "epoch": 2.0457899716177863, + "grad_norm": 2.1456851959228516, + "learning_rate": 4.696513938307894e-06, + "loss": 1.0512, + "step": 2703 + }, + { + "epoch": 2.0465468306527907, + "grad_norm": 2.6872496604919434, + "learning_rate": 4.689710611062389e-06, + "loss": 1.0907, + "step": 2704 + }, + { + "epoch": 2.0473036896877956, + "grad_norm": 2.116586446762085, + "learning_rate": 4.682910707393024e-06, + "loss": 1.1179, + "step": 2705 + }, + { + "epoch": 2.0480605487228005, + "grad_norm": 2.1400527954101562, + "learning_rate": 4.676114231682915e-06, + "loss": 1.0673, + "step": 2706 + }, + { + "epoch": 2.048817407757805, + "grad_norm": 2.1281938552856445, + "learning_rate": 4.669321188312969e-06, + "loss": 1.0719, + "step": 2707 + }, + { + "epoch": 2.04957426679281, + "grad_norm": 2.1432082653045654, + "learning_rate": 4.662531581661873e-06, + "loss": 1.0844, + "step": 2708 + }, + { + "epoch": 2.0503311258278147, + "grad_norm": 1.9638357162475586, + "learning_rate": 4.655745416106105e-06, + "loss": 1.0379, + "step": 2709 + }, + { + "epoch": 2.051087984862819, + "grad_norm": 2.069023609161377, + "learning_rate": 4.648962696019928e-06, + "loss": 1.0808, + "step": 2710 + }, + { + "epoch": 2.051844843897824, + "grad_norm": 1.974176287651062, + "learning_rate": 4.6421834257753745e-06, + "loss": 1.1431, + "step": 2711 + }, + { + "epoch": 2.052601702932829, + "grad_norm": 2.0162038803100586, + "learning_rate": 4.635407609742265e-06, + "loss": 1.0715, + "step": 2712 + }, + { + "epoch": 2.0533585619678334, + "grad_norm": 2.0583693981170654, + "learning_rate": 4.628635252288178e-06, + "loss": 1.0583, + "step": 2713 + }, + { + "epoch": 2.0541154210028383, + "grad_norm": 1.934477686882019, + "learning_rate": 4.621866357778479e-06, + "loss": 1.071, + "step": 2714 + }, + { + "epoch": 2.054872280037843, + "grad_norm": 2.2252588272094727, + "learning_rate": 4.61510093057629e-06, + "loss": 1.1205, + "step": 2715 + }, + { + "epoch": 2.0556291390728476, + "grad_norm": 2.0399527549743652, + "learning_rate": 4.608338975042509e-06, + "loss": 1.1, + "step": 2716 + }, + { + "epoch": 2.0563859981078525, + "grad_norm": 2.1194961071014404, + "learning_rate": 4.601580495535781e-06, + "loss": 1.0566, + "step": 2717 + }, + { + "epoch": 2.057142857142857, + "grad_norm": 2.1798765659332275, + "learning_rate": 4.594825496412527e-06, + "loss": 1.07, + "step": 2718 + }, + { + "epoch": 2.057899716177862, + "grad_norm": 2.00516414642334, + "learning_rate": 4.588073982026908e-06, + "loss": 1.0404, + "step": 2719 + }, + { + "epoch": 2.0586565752128667, + "grad_norm": 2.1549298763275146, + "learning_rate": 4.581325956730851e-06, + "loss": 1.0873, + "step": 2720 + }, + { + "epoch": 2.059413434247871, + "grad_norm": 2.3754074573516846, + "learning_rate": 4.574581424874031e-06, + "loss": 1.0917, + "step": 2721 + }, + { + "epoch": 2.060170293282876, + "grad_norm": 2.118363857269287, + "learning_rate": 4.56784039080387e-06, + "loss": 1.0864, + "step": 2722 + }, + { + "epoch": 2.060927152317881, + "grad_norm": 1.9879770278930664, + "learning_rate": 4.561102858865542e-06, + "loss": 1.047, + "step": 2723 + }, + { + "epoch": 2.0616840113528854, + "grad_norm": 2.0962250232696533, + "learning_rate": 4.554368833401944e-06, + "loss": 1.0803, + "step": 2724 + }, + { + "epoch": 2.0624408703878903, + "grad_norm": 2.095574378967285, + "learning_rate": 4.547638318753733e-06, + "loss": 1.1101, + "step": 2725 + }, + { + "epoch": 2.063197729422895, + "grad_norm": 2.2542734146118164, + "learning_rate": 4.540911319259297e-06, + "loss": 1.0672, + "step": 2726 + }, + { + "epoch": 2.0639545884578996, + "grad_norm": 2.1071441173553467, + "learning_rate": 4.534187839254755e-06, + "loss": 1.0295, + "step": 2727 + }, + { + "epoch": 2.0647114474929045, + "grad_norm": 2.2289743423461914, + "learning_rate": 4.527467883073962e-06, + "loss": 1.0759, + "step": 2728 + }, + { + "epoch": 2.0654683065279094, + "grad_norm": 2.20210862159729, + "learning_rate": 4.520751455048502e-06, + "loss": 1.0952, + "step": 2729 + }, + { + "epoch": 2.066225165562914, + "grad_norm": 2.0890111923217773, + "learning_rate": 4.5140385595076795e-06, + "loss": 1.1055, + "step": 2730 + }, + { + "epoch": 2.0669820245979187, + "grad_norm": 2.0093884468078613, + "learning_rate": 4.507329200778518e-06, + "loss": 1.0836, + "step": 2731 + }, + { + "epoch": 2.0677388836329236, + "grad_norm": 2.171649932861328, + "learning_rate": 4.500623383185774e-06, + "loss": 1.0794, + "step": 2732 + }, + { + "epoch": 2.068495742667928, + "grad_norm": 1.8796758651733398, + "learning_rate": 4.493921111051916e-06, + "loss": 1.084, + "step": 2733 + }, + { + "epoch": 2.069252601702933, + "grad_norm": 2.1491153240203857, + "learning_rate": 4.487222388697128e-06, + "loss": 1.0629, + "step": 2734 + }, + { + "epoch": 2.0700094607379373, + "grad_norm": 2.3227274417877197, + "learning_rate": 4.4805272204392965e-06, + "loss": 1.0901, + "step": 2735 + }, + { + "epoch": 2.0707663197729422, + "grad_norm": 2.256610631942749, + "learning_rate": 4.47383561059403e-06, + "loss": 1.1046, + "step": 2736 + }, + { + "epoch": 2.071523178807947, + "grad_norm": 1.9754210710525513, + "learning_rate": 4.467147563474642e-06, + "loss": 1.0433, + "step": 2737 + }, + { + "epoch": 2.0722800378429516, + "grad_norm": 2.2307772636413574, + "learning_rate": 4.460463083392139e-06, + "loss": 1.079, + "step": 2738 + }, + { + "epoch": 2.0730368968779564, + "grad_norm": 1.8400083780288696, + "learning_rate": 4.453782174655236e-06, + "loss": 1.0615, + "step": 2739 + }, + { + "epoch": 2.0737937559129613, + "grad_norm": 1.9424253702163696, + "learning_rate": 4.447104841570351e-06, + "loss": 1.0318, + "step": 2740 + }, + { + "epoch": 2.0745506149479658, + "grad_norm": 2.008769989013672, + "learning_rate": 4.440431088441582e-06, + "loss": 1.0861, + "step": 2741 + }, + { + "epoch": 2.0753074739829707, + "grad_norm": 2.024463415145874, + "learning_rate": 4.4337609195707325e-06, + "loss": 1.0406, + "step": 2742 + }, + { + "epoch": 2.0760643330179755, + "grad_norm": 2.0557620525360107, + "learning_rate": 4.4270943392572924e-06, + "loss": 1.0983, + "step": 2743 + }, + { + "epoch": 2.07682119205298, + "grad_norm": 2.7445693016052246, + "learning_rate": 4.420431351798441e-06, + "loss": 1.1253, + "step": 2744 + }, + { + "epoch": 2.077578051087985, + "grad_norm": 2.1181790828704834, + "learning_rate": 4.413771961489035e-06, + "loss": 1.0808, + "step": 2745 + }, + { + "epoch": 2.0783349101229898, + "grad_norm": 2.035220146179199, + "learning_rate": 4.4071161726216116e-06, + "loss": 1.0242, + "step": 2746 + }, + { + "epoch": 2.079091769157994, + "grad_norm": 2.0690808296203613, + "learning_rate": 4.4004639894863945e-06, + "loss": 1.072, + "step": 2747 + }, + { + "epoch": 2.079848628192999, + "grad_norm": 2.145026683807373, + "learning_rate": 4.39381541637128e-06, + "loss": 1.0455, + "step": 2748 + }, + { + "epoch": 2.080605487228004, + "grad_norm": 1.9162312746047974, + "learning_rate": 4.387170457561837e-06, + "loss": 1.073, + "step": 2749 + }, + { + "epoch": 2.0813623462630084, + "grad_norm": 2.1280105113983154, + "learning_rate": 4.380529117341305e-06, + "loss": 1.151, + "step": 2750 + }, + { + "epoch": 2.0821192052980133, + "grad_norm": 2.1977860927581787, + "learning_rate": 4.373891399990595e-06, + "loss": 1.0732, + "step": 2751 + }, + { + "epoch": 2.0828760643330178, + "grad_norm": 2.101363182067871, + "learning_rate": 4.367257309788268e-06, + "loss": 1.0534, + "step": 2752 + }, + { + "epoch": 2.0836329233680226, + "grad_norm": 2.0512733459472656, + "learning_rate": 4.360626851010562e-06, + "loss": 1.0613, + "step": 2753 + }, + { + "epoch": 2.0843897824030275, + "grad_norm": 2.4588279724121094, + "learning_rate": 4.35400002793137e-06, + "loss": 1.058, + "step": 2754 + }, + { + "epoch": 2.085146641438032, + "grad_norm": 2.0685465335845947, + "learning_rate": 4.347376844822242e-06, + "loss": 1.0532, + "step": 2755 + }, + { + "epoch": 2.085903500473037, + "grad_norm": 2.1436235904693604, + "learning_rate": 4.340757305952384e-06, + "loss": 1.0618, + "step": 2756 + }, + { + "epoch": 2.0866603595080417, + "grad_norm": 2.1337039470672607, + "learning_rate": 4.334141415588644e-06, + "loss": 1.0852, + "step": 2757 + }, + { + "epoch": 2.087417218543046, + "grad_norm": 1.9831031560897827, + "learning_rate": 4.3275291779955245e-06, + "loss": 1.1146, + "step": 2758 + }, + { + "epoch": 2.088174077578051, + "grad_norm": 2.2047150135040283, + "learning_rate": 4.320920597435174e-06, + "loss": 1.0817, + "step": 2759 + }, + { + "epoch": 2.088930936613056, + "grad_norm": 2.065563201904297, + "learning_rate": 4.3143156781673846e-06, + "loss": 1.1424, + "step": 2760 + }, + { + "epoch": 2.0896877956480604, + "grad_norm": 2.1660046577453613, + "learning_rate": 4.307714424449583e-06, + "loss": 1.1019, + "step": 2761 + }, + { + "epoch": 2.0904446546830653, + "grad_norm": 2.0400032997131348, + "learning_rate": 4.301116840536844e-06, + "loss": 1.0345, + "step": 2762 + }, + { + "epoch": 2.09120151371807, + "grad_norm": 1.9771476984024048, + "learning_rate": 4.29452293068186e-06, + "loss": 1.0576, + "step": 2763 + }, + { + "epoch": 2.0919583727530746, + "grad_norm": 2.2237024307250977, + "learning_rate": 4.287932699134973e-06, + "loss": 1.0808, + "step": 2764 + }, + { + "epoch": 2.0927152317880795, + "grad_norm": 2.0638787746429443, + "learning_rate": 4.281346150144139e-06, + "loss": 1.0315, + "step": 2765 + }, + { + "epoch": 2.0934720908230844, + "grad_norm": 2.370335817337036, + "learning_rate": 4.27476328795495e-06, + "loss": 1.1243, + "step": 2766 + }, + { + "epoch": 2.094228949858089, + "grad_norm": 2.069380521774292, + "learning_rate": 4.268184116810623e-06, + "loss": 1.0801, + "step": 2767 + }, + { + "epoch": 2.0949858088930937, + "grad_norm": 2.310030221939087, + "learning_rate": 4.261608640951981e-06, + "loss": 1.0666, + "step": 2768 + }, + { + "epoch": 2.0957426679280986, + "grad_norm": 2.0185890197753906, + "learning_rate": 4.255036864617483e-06, + "loss": 1.0817, + "step": 2769 + }, + { + "epoch": 2.096499526963103, + "grad_norm": 2.0323379039764404, + "learning_rate": 4.248468792043194e-06, + "loss": 1.057, + "step": 2770 + }, + { + "epoch": 2.097256385998108, + "grad_norm": 2.362914562225342, + "learning_rate": 4.241904427462797e-06, + "loss": 1.0846, + "step": 2771 + }, + { + "epoch": 2.0980132450331124, + "grad_norm": 2.203740119934082, + "learning_rate": 4.235343775107575e-06, + "loss": 1.0565, + "step": 2772 + }, + { + "epoch": 2.0987701040681173, + "grad_norm": 2.006248712539673, + "learning_rate": 4.22878683920643e-06, + "loss": 1.1263, + "step": 2773 + }, + { + "epoch": 2.099526963103122, + "grad_norm": 2.120445489883423, + "learning_rate": 4.222233623985858e-06, + "loss": 1.0629, + "step": 2774 + }, + { + "epoch": 2.1002838221381266, + "grad_norm": 2.015179395675659, + "learning_rate": 4.2156841336699625e-06, + "loss": 1.0304, + "step": 2775 + }, + { + "epoch": 2.1010406811731315, + "grad_norm": 2.1381468772888184, + "learning_rate": 4.209138372480447e-06, + "loss": 1.0669, + "step": 2776 + }, + { + "epoch": 2.1017975402081364, + "grad_norm": 2.2807891368865967, + "learning_rate": 4.202596344636609e-06, + "loss": 1.0635, + "step": 2777 + }, + { + "epoch": 2.102554399243141, + "grad_norm": 2.1004843711853027, + "learning_rate": 4.196058054355347e-06, + "loss": 1.1306, + "step": 2778 + }, + { + "epoch": 2.1033112582781457, + "grad_norm": 2.092963695526123, + "learning_rate": 4.189523505851129e-06, + "loss": 1.0561, + "step": 2779 + }, + { + "epoch": 2.1040681173131506, + "grad_norm": 2.0627875328063965, + "learning_rate": 4.1829927033360314e-06, + "loss": 1.0671, + "step": 2780 + }, + { + "epoch": 2.104824976348155, + "grad_norm": 2.0852344036102295, + "learning_rate": 4.17646565101971e-06, + "loss": 1.056, + "step": 2781 + }, + { + "epoch": 2.10558183538316, + "grad_norm": 2.0920495986938477, + "learning_rate": 4.1699423531094065e-06, + "loss": 1.0415, + "step": 2782 + }, + { + "epoch": 2.106338694418165, + "grad_norm": 2.3179705142974854, + "learning_rate": 4.163422813809934e-06, + "loss": 1.0648, + "step": 2783 + }, + { + "epoch": 2.1070955534531692, + "grad_norm": 2.0878725051879883, + "learning_rate": 4.156907037323696e-06, + "loss": 1.0996, + "step": 2784 + }, + { + "epoch": 2.107852412488174, + "grad_norm": 2.1616759300231934, + "learning_rate": 4.1503950278506565e-06, + "loss": 1.0473, + "step": 2785 + }, + { + "epoch": 2.108609271523179, + "grad_norm": 2.314814805984497, + "learning_rate": 4.1438867895883555e-06, + "loss": 1.0717, + "step": 2786 + }, + { + "epoch": 2.1093661305581834, + "grad_norm": 2.105376958847046, + "learning_rate": 4.137382326731906e-06, + "loss": 1.1177, + "step": 2787 + }, + { + "epoch": 2.1101229895931883, + "grad_norm": 2.18996000289917, + "learning_rate": 4.130881643473987e-06, + "loss": 1.0923, + "step": 2788 + }, + { + "epoch": 2.1108798486281932, + "grad_norm": 2.0627288818359375, + "learning_rate": 4.124384744004844e-06, + "loss": 1.0307, + "step": 2789 + }, + { + "epoch": 2.1116367076631977, + "grad_norm": 2.2541861534118652, + "learning_rate": 4.117891632512271e-06, + "loss": 1.0543, + "step": 2790 + }, + { + "epoch": 2.1123935666982026, + "grad_norm": 2.0544228553771973, + "learning_rate": 4.111402313181631e-06, + "loss": 1.0987, + "step": 2791 + }, + { + "epoch": 2.113150425733207, + "grad_norm": 2.1496474742889404, + "learning_rate": 4.1049167901958454e-06, + "loss": 1.0422, + "step": 2792 + }, + { + "epoch": 2.113907284768212, + "grad_norm": 2.1363749504089355, + "learning_rate": 4.098435067735377e-06, + "loss": 1.0371, + "step": 2793 + }, + { + "epoch": 2.1146641438032168, + "grad_norm": 2.166128635406494, + "learning_rate": 4.091957149978247e-06, + "loss": 1.056, + "step": 2794 + }, + { + "epoch": 2.115421002838221, + "grad_norm": 2.3086111545562744, + "learning_rate": 4.085483041100028e-06, + "loss": 1.0582, + "step": 2795 + }, + { + "epoch": 2.116177861873226, + "grad_norm": 2.0368103981018066, + "learning_rate": 4.079012745273822e-06, + "loss": 0.9679, + "step": 2796 + }, + { + "epoch": 2.116934720908231, + "grad_norm": 2.146679639816284, + "learning_rate": 4.072546266670289e-06, + "loss": 1.0472, + "step": 2797 + }, + { + "epoch": 2.1176915799432354, + "grad_norm": 2.188101291656494, + "learning_rate": 4.0660836094576215e-06, + "loss": 1.1283, + "step": 2798 + }, + { + "epoch": 2.1184484389782403, + "grad_norm": 2.099888563156128, + "learning_rate": 4.059624777801554e-06, + "loss": 1.0708, + "step": 2799 + }, + { + "epoch": 2.119205298013245, + "grad_norm": 2.088252544403076, + "learning_rate": 4.053169775865346e-06, + "loss": 1.0619, + "step": 2800 + }, + { + "epoch": 2.1199621570482496, + "grad_norm": 2.0278518199920654, + "learning_rate": 4.046718607809791e-06, + "loss": 1.0549, + "step": 2801 + }, + { + "epoch": 2.1207190160832545, + "grad_norm": 1.9221056699752808, + "learning_rate": 4.040271277793217e-06, + "loss": 1.0776, + "step": 2802 + }, + { + "epoch": 2.1214758751182594, + "grad_norm": 2.296339511871338, + "learning_rate": 4.033827789971474e-06, + "loss": 1.0686, + "step": 2803 + }, + { + "epoch": 2.122232734153264, + "grad_norm": 2.1365742683410645, + "learning_rate": 4.027388148497936e-06, + "loss": 1.0812, + "step": 2804 + }, + { + "epoch": 2.1229895931882687, + "grad_norm": 1.9683605432510376, + "learning_rate": 4.020952357523498e-06, + "loss": 1.0168, + "step": 2805 + }, + { + "epoch": 2.1237464522232736, + "grad_norm": 2.0199337005615234, + "learning_rate": 4.014520421196579e-06, + "loss": 1.1035, + "step": 2806 + }, + { + "epoch": 2.124503311258278, + "grad_norm": 2.0269358158111572, + "learning_rate": 4.008092343663094e-06, + "loss": 1.0973, + "step": 2807 + }, + { + "epoch": 2.125260170293283, + "grad_norm": 2.286689519882202, + "learning_rate": 4.001668129066491e-06, + "loss": 1.0882, + "step": 2808 + }, + { + "epoch": 2.1260170293282874, + "grad_norm": 2.257807731628418, + "learning_rate": 3.995247781547721e-06, + "loss": 1.0877, + "step": 2809 + }, + { + "epoch": 2.1267738883632923, + "grad_norm": 2.049635171890259, + "learning_rate": 3.98883130524524e-06, + "loss": 1.0924, + "step": 2810 + }, + { + "epoch": 2.127530747398297, + "grad_norm": 2.112349033355713, + "learning_rate": 3.982418704295016e-06, + "loss": 1.0931, + "step": 2811 + }, + { + "epoch": 2.1282876064333016, + "grad_norm": 2.0468220710754395, + "learning_rate": 3.9760099828305104e-06, + "loss": 1.0842, + "step": 2812 + }, + { + "epoch": 2.1290444654683065, + "grad_norm": 2.0540926456451416, + "learning_rate": 3.969605144982682e-06, + "loss": 1.0924, + "step": 2813 + }, + { + "epoch": 2.1298013245033114, + "grad_norm": 2.1668741703033447, + "learning_rate": 3.963204194879998e-06, + "loss": 1.1271, + "step": 2814 + }, + { + "epoch": 2.130558183538316, + "grad_norm": 1.9331365823745728, + "learning_rate": 3.956807136648411e-06, + "loss": 1.1066, + "step": 2815 + }, + { + "epoch": 2.1313150425733207, + "grad_norm": 1.9183405637741089, + "learning_rate": 3.950413974411367e-06, + "loss": 1.1018, + "step": 2816 + }, + { + "epoch": 2.1320719016083256, + "grad_norm": 1.9769048690795898, + "learning_rate": 3.944024712289805e-06, + "loss": 1.0565, + "step": 2817 + }, + { + "epoch": 2.13282876064333, + "grad_norm": 1.961674690246582, + "learning_rate": 3.93763935440214e-06, + "loss": 1.0816, + "step": 2818 + }, + { + "epoch": 2.133585619678335, + "grad_norm": 2.0646157264709473, + "learning_rate": 3.931257904864283e-06, + "loss": 1.0373, + "step": 2819 + }, + { + "epoch": 2.13434247871334, + "grad_norm": 2.243910074234009, + "learning_rate": 3.92488036778961e-06, + "loss": 1.0423, + "step": 2820 + }, + { + "epoch": 2.1350993377483443, + "grad_norm": 2.192121744155884, + "learning_rate": 3.91850674728899e-06, + "loss": 1.0402, + "step": 2821 + }, + { + "epoch": 2.135856196783349, + "grad_norm": 2.0529327392578125, + "learning_rate": 3.912137047470764e-06, + "loss": 1.0676, + "step": 2822 + }, + { + "epoch": 2.136613055818354, + "grad_norm": 2.1558024883270264, + "learning_rate": 3.9057712724407366e-06, + "loss": 1.06, + "step": 2823 + }, + { + "epoch": 2.1373699148533585, + "grad_norm": 1.8848477602005005, + "learning_rate": 3.899409426302193e-06, + "loss": 1.0668, + "step": 2824 + }, + { + "epoch": 2.1381267738883634, + "grad_norm": 2.0971271991729736, + "learning_rate": 3.893051513155881e-06, + "loss": 1.1362, + "step": 2825 + }, + { + "epoch": 2.138883632923368, + "grad_norm": 2.3545618057250977, + "learning_rate": 3.88669753710002e-06, + "loss": 1.1014, + "step": 2826 + }, + { + "epoch": 2.1396404919583727, + "grad_norm": 2.0267715454101562, + "learning_rate": 3.880347502230277e-06, + "loss": 1.0955, + "step": 2827 + }, + { + "epoch": 2.1403973509933776, + "grad_norm": 2.065638780593872, + "learning_rate": 3.874001412639796e-06, + "loss": 1.0732, + "step": 2828 + }, + { + "epoch": 2.141154210028382, + "grad_norm": 2.221348285675049, + "learning_rate": 3.867659272419163e-06, + "loss": 1.0891, + "step": 2829 + }, + { + "epoch": 2.141911069063387, + "grad_norm": 2.0352323055267334, + "learning_rate": 3.861321085656425e-06, + "loss": 1.0615, + "step": 2830 + }, + { + "epoch": 2.142667928098392, + "grad_norm": 2.294567584991455, + "learning_rate": 3.854986856437086e-06, + "loss": 1.0886, + "step": 2831 + }, + { + "epoch": 2.1434247871333962, + "grad_norm": 2.132350444793701, + "learning_rate": 3.848656588844089e-06, + "loss": 1.0932, + "step": 2832 + }, + { + "epoch": 2.144181646168401, + "grad_norm": 2.0099170207977295, + "learning_rate": 3.842330286957837e-06, + "loss": 1.1081, + "step": 2833 + }, + { + "epoch": 2.144938505203406, + "grad_norm": 2.181610584259033, + "learning_rate": 3.836007954856154e-06, + "loss": 1.1125, + "step": 2834 + }, + { + "epoch": 2.1456953642384105, + "grad_norm": 2.204340934753418, + "learning_rate": 3.829689596614324e-06, + "loss": 1.1074, + "step": 2835 + }, + { + "epoch": 2.1464522232734153, + "grad_norm": 2.0872256755828857, + "learning_rate": 3.823375216305066e-06, + "loss": 1.0709, + "step": 2836 + }, + { + "epoch": 2.1472090823084202, + "grad_norm": 2.192131757736206, + "learning_rate": 3.8170648179985324e-06, + "loss": 1.0539, + "step": 2837 + }, + { + "epoch": 2.1479659413434247, + "grad_norm": 1.9136378765106201, + "learning_rate": 3.810758405762311e-06, + "loss": 1.0931, + "step": 2838 + }, + { + "epoch": 2.1487228003784296, + "grad_norm": 2.390619993209839, + "learning_rate": 3.8044559836614203e-06, + "loss": 1.0645, + "step": 2839 + }, + { + "epoch": 2.1494796594134344, + "grad_norm": 2.138697862625122, + "learning_rate": 3.798157555758304e-06, + "loss": 1.084, + "step": 2840 + }, + { + "epoch": 2.150236518448439, + "grad_norm": 1.9692342281341553, + "learning_rate": 3.791863126112828e-06, + "loss": 1.0735, + "step": 2841 + }, + { + "epoch": 2.1509933774834438, + "grad_norm": 2.02774715423584, + "learning_rate": 3.78557269878229e-06, + "loss": 1.0853, + "step": 2842 + }, + { + "epoch": 2.151750236518448, + "grad_norm": 1.9480324983596802, + "learning_rate": 3.779286277821402e-06, + "loss": 1.0504, + "step": 2843 + }, + { + "epoch": 2.152507095553453, + "grad_norm": 2.0497875213623047, + "learning_rate": 3.773003867282301e-06, + "loss": 1.0475, + "step": 2844 + }, + { + "epoch": 2.153263954588458, + "grad_norm": 2.0127995014190674, + "learning_rate": 3.766725471214524e-06, + "loss": 1.0773, + "step": 2845 + }, + { + "epoch": 2.1540208136234624, + "grad_norm": 1.920920729637146, + "learning_rate": 3.760451093665034e-06, + "loss": 1.0747, + "step": 2846 + }, + { + "epoch": 2.1547776726584673, + "grad_norm": 1.9610087871551514, + "learning_rate": 3.754180738678201e-06, + "loss": 1.0929, + "step": 2847 + }, + { + "epoch": 2.155534531693472, + "grad_norm": 2.1988742351531982, + "learning_rate": 3.7479144102957955e-06, + "loss": 1.0426, + "step": 2848 + }, + { + "epoch": 2.1562913907284766, + "grad_norm": 2.0719704627990723, + "learning_rate": 3.7416521125569987e-06, + "loss": 1.0965, + "step": 2849 + }, + { + "epoch": 2.1570482497634815, + "grad_norm": 2.073084592819214, + "learning_rate": 3.7353938494983966e-06, + "loss": 1.0428, + "step": 2850 + }, + { + "epoch": 2.1578051087984864, + "grad_norm": 1.9803792238235474, + "learning_rate": 3.729139625153964e-06, + "loss": 1.0724, + "step": 2851 + }, + { + "epoch": 2.158561967833491, + "grad_norm": 1.9936349391937256, + "learning_rate": 3.72288944355508e-06, + "loss": 1.0278, + "step": 2852 + }, + { + "epoch": 2.1593188268684957, + "grad_norm": 2.1690564155578613, + "learning_rate": 3.7166433087305177e-06, + "loss": 1.044, + "step": 2853 + }, + { + "epoch": 2.1600756859035006, + "grad_norm": 2.125483512878418, + "learning_rate": 3.7104012247064436e-06, + "loss": 1.0493, + "step": 2854 + }, + { + "epoch": 2.160832544938505, + "grad_norm": 2.031766653060913, + "learning_rate": 3.7041631955064067e-06, + "loss": 1.0746, + "step": 2855 + }, + { + "epoch": 2.16158940397351, + "grad_norm": 2.1385655403137207, + "learning_rate": 3.697929225151341e-06, + "loss": 1.0993, + "step": 2856 + }, + { + "epoch": 2.162346263008515, + "grad_norm": 2.363760471343994, + "learning_rate": 3.691699317659574e-06, + "loss": 1.0544, + "step": 2857 + }, + { + "epoch": 2.1631031220435193, + "grad_norm": 2.0311970710754395, + "learning_rate": 3.685473477046807e-06, + "loss": 1.0244, + "step": 2858 + }, + { + "epoch": 2.163859981078524, + "grad_norm": 2.2926740646362305, + "learning_rate": 3.679251707326123e-06, + "loss": 1.0813, + "step": 2859 + }, + { + "epoch": 2.164616840113529, + "grad_norm": 2.1094629764556885, + "learning_rate": 3.6730340125079804e-06, + "loss": 1.0729, + "step": 2860 + }, + { + "epoch": 2.1653736991485335, + "grad_norm": 2.2575571537017822, + "learning_rate": 3.6668203966002157e-06, + "loss": 1.1031, + "step": 2861 + }, + { + "epoch": 2.1661305581835384, + "grad_norm": 2.2293882369995117, + "learning_rate": 3.660610863608018e-06, + "loss": 1.0676, + "step": 2862 + }, + { + "epoch": 2.1668874172185433, + "grad_norm": 2.214388132095337, + "learning_rate": 3.6544054175339655e-06, + "loss": 1.1499, + "step": 2863 + }, + { + "epoch": 2.1676442762535477, + "grad_norm": 1.9468921422958374, + "learning_rate": 3.6482040623779925e-06, + "loss": 1.0726, + "step": 2864 + }, + { + "epoch": 2.1684011352885526, + "grad_norm": 2.0682532787323, + "learning_rate": 3.642006802137399e-06, + "loss": 1.0695, + "step": 2865 + }, + { + "epoch": 2.169157994323557, + "grad_norm": 2.1253714561462402, + "learning_rate": 3.6358136408068475e-06, + "loss": 1.116, + "step": 2866 + }, + { + "epoch": 2.169914853358562, + "grad_norm": 2.113579511642456, + "learning_rate": 3.6296245823783514e-06, + "loss": 1.0874, + "step": 2867 + }, + { + "epoch": 2.170671712393567, + "grad_norm": 1.9568238258361816, + "learning_rate": 3.623439630841282e-06, + "loss": 1.0363, + "step": 2868 + }, + { + "epoch": 2.1714285714285713, + "grad_norm": 2.3202977180480957, + "learning_rate": 3.6172587901823652e-06, + "loss": 1.0881, + "step": 2869 + }, + { + "epoch": 2.172185430463576, + "grad_norm": 2.232671022415161, + "learning_rate": 3.611082064385679e-06, + "loss": 1.0919, + "step": 2870 + }, + { + "epoch": 2.172942289498581, + "grad_norm": 1.9573837518692017, + "learning_rate": 3.6049094574326453e-06, + "loss": 1.017, + "step": 2871 + }, + { + "epoch": 2.1736991485335855, + "grad_norm": 2.110637664794922, + "learning_rate": 3.598740973302036e-06, + "loss": 1.1066, + "step": 2872 + }, + { + "epoch": 2.1744560075685904, + "grad_norm": 2.0418527126312256, + "learning_rate": 3.592576615969956e-06, + "loss": 1.0607, + "step": 2873 + }, + { + "epoch": 2.1752128666035953, + "grad_norm": 2.177208662033081, + "learning_rate": 3.5864163894098624e-06, + "loss": 1.0724, + "step": 2874 + }, + { + "epoch": 2.1759697256385997, + "grad_norm": 2.2924139499664307, + "learning_rate": 3.580260297592535e-06, + "loss": 1.0593, + "step": 2875 + }, + { + "epoch": 2.1767265846736046, + "grad_norm": 2.0772855281829834, + "learning_rate": 3.574108344486102e-06, + "loss": 1.0661, + "step": 2876 + }, + { + "epoch": 2.1774834437086095, + "grad_norm": 2.265467405319214, + "learning_rate": 3.5679605340560187e-06, + "loss": 1.0569, + "step": 2877 + }, + { + "epoch": 2.178240302743614, + "grad_norm": 2.154500722885132, + "learning_rate": 3.5618168702650713e-06, + "loss": 1.0332, + "step": 2878 + }, + { + "epoch": 2.178997161778619, + "grad_norm": 2.0559258460998535, + "learning_rate": 3.5556773570733666e-06, + "loss": 1.0697, + "step": 2879 + }, + { + "epoch": 2.1797540208136237, + "grad_norm": 2.4780728816986084, + "learning_rate": 3.5495419984383452e-06, + "loss": 1.0525, + "step": 2880 + }, + { + "epoch": 2.180510879848628, + "grad_norm": 2.0388307571411133, + "learning_rate": 3.543410798314767e-06, + "loss": 1.0224, + "step": 2881 + }, + { + "epoch": 2.181267738883633, + "grad_norm": 2.0700438022613525, + "learning_rate": 3.5372837606547056e-06, + "loss": 1.0795, + "step": 2882 + }, + { + "epoch": 2.1820245979186375, + "grad_norm": 2.08799409866333, + "learning_rate": 3.5311608894075606e-06, + "loss": 1.1147, + "step": 2883 + }, + { + "epoch": 2.1827814569536423, + "grad_norm": 2.04353928565979, + "learning_rate": 3.5250421885200357e-06, + "loss": 1.0693, + "step": 2884 + }, + { + "epoch": 2.1835383159886472, + "grad_norm": 2.1684114933013916, + "learning_rate": 3.5189276619361567e-06, + "loss": 1.0844, + "step": 2885 + }, + { + "epoch": 2.1842951750236517, + "grad_norm": 2.2811787128448486, + "learning_rate": 3.5128173135972515e-06, + "loss": 1.0921, + "step": 2886 + }, + { + "epoch": 2.1850520340586566, + "grad_norm": 2.293611764907837, + "learning_rate": 3.5067111474419603e-06, + "loss": 1.1276, + "step": 2887 + }, + { + "epoch": 2.1858088930936614, + "grad_norm": 1.9369990825653076, + "learning_rate": 3.5006091674062263e-06, + "loss": 1.0811, + "step": 2888 + }, + { + "epoch": 2.186565752128666, + "grad_norm": 2.1612861156463623, + "learning_rate": 3.494511377423291e-06, + "loss": 1.0987, + "step": 2889 + }, + { + "epoch": 2.1873226111636708, + "grad_norm": 2.301436424255371, + "learning_rate": 3.488417781423691e-06, + "loss": 1.1224, + "step": 2890 + }, + { + "epoch": 2.1880794701986757, + "grad_norm": 2.149083375930786, + "learning_rate": 3.482328383335271e-06, + "loss": 1.0906, + "step": 2891 + }, + { + "epoch": 2.18883632923368, + "grad_norm": 2.4687178134918213, + "learning_rate": 3.4762431870831625e-06, + "loss": 1.1381, + "step": 2892 + }, + { + "epoch": 2.189593188268685, + "grad_norm": 2.131269693374634, + "learning_rate": 3.4701621965897906e-06, + "loss": 1.0644, + "step": 2893 + }, + { + "epoch": 2.19035004730369, + "grad_norm": 2.0854032039642334, + "learning_rate": 3.464085415774874e-06, + "loss": 1.0703, + "step": 2894 + }, + { + "epoch": 2.1911069063386943, + "grad_norm": 1.988800287246704, + "learning_rate": 3.458012848555407e-06, + "loss": 1.0925, + "step": 2895 + }, + { + "epoch": 2.191863765373699, + "grad_norm": 2.0683155059814453, + "learning_rate": 3.451944498845673e-06, + "loss": 1.1212, + "step": 2896 + }, + { + "epoch": 2.192620624408704, + "grad_norm": 2.1208488941192627, + "learning_rate": 3.4458803705572385e-06, + "loss": 1.0917, + "step": 2897 + }, + { + "epoch": 2.1933774834437085, + "grad_norm": 1.9864528179168701, + "learning_rate": 3.4398204675989504e-06, + "loss": 1.1095, + "step": 2898 + }, + { + "epoch": 2.1941343424787134, + "grad_norm": 2.0708682537078857, + "learning_rate": 3.4337647938769283e-06, + "loss": 1.0989, + "step": 2899 + }, + { + "epoch": 2.194891201513718, + "grad_norm": 2.229597330093384, + "learning_rate": 3.4277133532945704e-06, + "loss": 1.1137, + "step": 2900 + }, + { + "epoch": 2.1956480605487227, + "grad_norm": 2.039870500564575, + "learning_rate": 3.4216661497525372e-06, + "loss": 1.0866, + "step": 2901 + }, + { + "epoch": 2.1964049195837276, + "grad_norm": 2.037367582321167, + "learning_rate": 3.4156231871487706e-06, + "loss": 1.0947, + "step": 2902 + }, + { + "epoch": 2.197161778618732, + "grad_norm": 2.3312087059020996, + "learning_rate": 3.4095844693784647e-06, + "loss": 1.0883, + "step": 2903 + }, + { + "epoch": 2.197918637653737, + "grad_norm": 2.1165080070495605, + "learning_rate": 3.4035500003340886e-06, + "loss": 1.07, + "step": 2904 + }, + { + "epoch": 2.198675496688742, + "grad_norm": 2.1637613773345947, + "learning_rate": 3.3975197839053727e-06, + "loss": 1.0012, + "step": 2905 + }, + { + "epoch": 2.1994323557237463, + "grad_norm": 2.1280291080474854, + "learning_rate": 3.3914938239792956e-06, + "loss": 1.0525, + "step": 2906 + }, + { + "epoch": 2.200189214758751, + "grad_norm": 2.1883440017700195, + "learning_rate": 3.385472124440102e-06, + "loss": 1.0466, + "step": 2907 + }, + { + "epoch": 2.200946073793756, + "grad_norm": 2.120882272720337, + "learning_rate": 3.3794546891692883e-06, + "loss": 1.0268, + "step": 2908 + }, + { + "epoch": 2.2017029328287605, + "grad_norm": 2.141380786895752, + "learning_rate": 3.3734415220456036e-06, + "loss": 1.0695, + "step": 2909 + }, + { + "epoch": 2.2024597918637654, + "grad_norm": 1.9636356830596924, + "learning_rate": 3.3674326269450386e-06, + "loss": 1.035, + "step": 2910 + }, + { + "epoch": 2.2032166508987703, + "grad_norm": 2.227339506149292, + "learning_rate": 3.361428007740842e-06, + "loss": 1.1143, + "step": 2911 + }, + { + "epoch": 2.2039735099337747, + "grad_norm": 2.206693172454834, + "learning_rate": 3.3554276683034933e-06, + "loss": 1.075, + "step": 2912 + }, + { + "epoch": 2.2047303689687796, + "grad_norm": 2.3205721378326416, + "learning_rate": 3.349431612500721e-06, + "loss": 1.0599, + "step": 2913 + }, + { + "epoch": 2.2054872280037845, + "grad_norm": 2.0222678184509277, + "learning_rate": 3.343439844197493e-06, + "loss": 1.083, + "step": 2914 + }, + { + "epoch": 2.206244087038789, + "grad_norm": 2.076840400695801, + "learning_rate": 3.337452367256012e-06, + "loss": 1.0306, + "step": 2915 + }, + { + "epoch": 2.207000946073794, + "grad_norm": 1.9618786573410034, + "learning_rate": 3.3314691855357197e-06, + "loss": 1.1021, + "step": 2916 + }, + { + "epoch": 2.2077578051087983, + "grad_norm": 2.168519973754883, + "learning_rate": 3.3254903028932716e-06, + "loss": 1.007, + "step": 2917 + }, + { + "epoch": 2.208514664143803, + "grad_norm": 2.0127992630004883, + "learning_rate": 3.3195157231825704e-06, + "loss": 1.0797, + "step": 2918 + }, + { + "epoch": 2.209271523178808, + "grad_norm": 2.0020880699157715, + "learning_rate": 3.3135454502547397e-06, + "loss": 1.1154, + "step": 2919 + }, + { + "epoch": 2.2100283822138125, + "grad_norm": 1.9836198091506958, + "learning_rate": 3.307579487958125e-06, + "loss": 1.0418, + "step": 2920 + }, + { + "epoch": 2.2107852412488174, + "grad_norm": 1.9691238403320312, + "learning_rate": 3.3016178401382957e-06, + "loss": 1.1094, + "step": 2921 + }, + { + "epoch": 2.2115421002838223, + "grad_norm": 2.1438305377960205, + "learning_rate": 3.2956605106380464e-06, + "loss": 1.0935, + "step": 2922 + }, + { + "epoch": 2.2122989593188267, + "grad_norm": 2.1357624530792236, + "learning_rate": 3.2897075032973656e-06, + "loss": 1.1033, + "step": 2923 + }, + { + "epoch": 2.2130558183538316, + "grad_norm": 2.027420997619629, + "learning_rate": 3.28375882195348e-06, + "loss": 1.0502, + "step": 2924 + }, + { + "epoch": 2.2138126773888365, + "grad_norm": 2.073096513748169, + "learning_rate": 3.2778144704408167e-06, + "loss": 1.0565, + "step": 2925 + }, + { + "epoch": 2.214569536423841, + "grad_norm": 2.12164306640625, + "learning_rate": 3.271874452591015e-06, + "loss": 1.103, + "step": 2926 + }, + { + "epoch": 2.215326395458846, + "grad_norm": 2.0933268070220947, + "learning_rate": 3.2659387722329226e-06, + "loss": 1.0776, + "step": 2927 + }, + { + "epoch": 2.2160832544938507, + "grad_norm": 2.033733367919922, + "learning_rate": 3.2600074331925834e-06, + "loss": 1.0642, + "step": 2928 + }, + { + "epoch": 2.216840113528855, + "grad_norm": 1.951857328414917, + "learning_rate": 3.2540804392932527e-06, + "loss": 1.0956, + "step": 2929 + }, + { + "epoch": 2.21759697256386, + "grad_norm": 2.0612125396728516, + "learning_rate": 3.2481577943553766e-06, + "loss": 1.0756, + "step": 2930 + }, + { + "epoch": 2.218353831598865, + "grad_norm": 1.9757081270217896, + "learning_rate": 3.2422395021966006e-06, + "loss": 1.0937, + "step": 2931 + }, + { + "epoch": 2.2191106906338693, + "grad_norm": 1.9480013847351074, + "learning_rate": 3.2363255666317706e-06, + "loss": 1.0986, + "step": 2932 + }, + { + "epoch": 2.2198675496688742, + "grad_norm": 2.087038040161133, + "learning_rate": 3.2304159914729194e-06, + "loss": 1.0518, + "step": 2933 + }, + { + "epoch": 2.2206244087038787, + "grad_norm": 2.119804859161377, + "learning_rate": 3.2245107805292625e-06, + "loss": 1.0356, + "step": 2934 + }, + { + "epoch": 2.2213812677388836, + "grad_norm": 2.2135863304138184, + "learning_rate": 3.2186099376072133e-06, + "loss": 1.0988, + "step": 2935 + }, + { + "epoch": 2.2221381267738884, + "grad_norm": 1.9379045963287354, + "learning_rate": 3.2127134665103684e-06, + "loss": 1.0762, + "step": 2936 + }, + { + "epoch": 2.222894985808893, + "grad_norm": 2.078213691711426, + "learning_rate": 3.206821371039495e-06, + "loss": 1.1031, + "step": 2937 + }, + { + "epoch": 2.2236518448438978, + "grad_norm": 1.7914735078811646, + "learning_rate": 3.2009336549925558e-06, + "loss": 1.0699, + "step": 2938 + }, + { + "epoch": 2.2244087038789027, + "grad_norm": 2.0808887481689453, + "learning_rate": 3.195050322164676e-06, + "loss": 1.0362, + "step": 2939 + }, + { + "epoch": 2.225165562913907, + "grad_norm": 2.2079849243164062, + "learning_rate": 3.1891713763481664e-06, + "loss": 1.0544, + "step": 2940 + }, + { + "epoch": 2.225922421948912, + "grad_norm": 2.137425184249878, + "learning_rate": 3.1832968213325056e-06, + "loss": 1.0465, + "step": 2941 + }, + { + "epoch": 2.226679280983917, + "grad_norm": 2.0021207332611084, + "learning_rate": 3.177426660904339e-06, + "loss": 1.0756, + "step": 2942 + }, + { + "epoch": 2.2274361400189213, + "grad_norm": 2.105543851852417, + "learning_rate": 3.1715608988474904e-06, + "loss": 1.0771, + "step": 2943 + }, + { + "epoch": 2.228192999053926, + "grad_norm": 1.925067663192749, + "learning_rate": 3.1656995389429347e-06, + "loss": 1.0919, + "step": 2944 + }, + { + "epoch": 2.228949858088931, + "grad_norm": 2.076474905014038, + "learning_rate": 3.159842584968813e-06, + "loss": 1.0684, + "step": 2945 + }, + { + "epoch": 2.2297067171239355, + "grad_norm": 2.0177693367004395, + "learning_rate": 3.15399004070043e-06, + "loss": 1.1303, + "step": 2946 + }, + { + "epoch": 2.2304635761589404, + "grad_norm": 1.9716448783874512, + "learning_rate": 3.1481419099102477e-06, + "loss": 1.0174, + "step": 2947 + }, + { + "epoch": 2.2312204351939453, + "grad_norm": 2.0462799072265625, + "learning_rate": 3.1422981963678823e-06, + "loss": 1.0466, + "step": 2948 + }, + { + "epoch": 2.2319772942289497, + "grad_norm": 2.242594003677368, + "learning_rate": 3.1364589038401055e-06, + "loss": 1.0786, + "step": 2949 + }, + { + "epoch": 2.2327341532639546, + "grad_norm": 2.011847972869873, + "learning_rate": 3.1306240360908325e-06, + "loss": 1.0735, + "step": 2950 + }, + { + "epoch": 2.2334910122989595, + "grad_norm": 1.8826643228530884, + "learning_rate": 3.124793596881128e-06, + "loss": 1.105, + "step": 2951 + }, + { + "epoch": 2.234247871333964, + "grad_norm": 2.173900604248047, + "learning_rate": 3.118967589969205e-06, + "loss": 1.0665, + "step": 2952 + }, + { + "epoch": 2.235004730368969, + "grad_norm": 2.065894365310669, + "learning_rate": 3.1131460191104214e-06, + "loss": 1.1092, + "step": 2953 + }, + { + "epoch": 2.2357615894039737, + "grad_norm": 2.1508965492248535, + "learning_rate": 3.107328888057271e-06, + "loss": 1.0692, + "step": 2954 + }, + { + "epoch": 2.236518448438978, + "grad_norm": 2.2163479328155518, + "learning_rate": 3.1015162005593918e-06, + "loss": 1.1466, + "step": 2955 + }, + { + "epoch": 2.237275307473983, + "grad_norm": 2.0771398544311523, + "learning_rate": 3.095707960363548e-06, + "loss": 1.1392, + "step": 2956 + }, + { + "epoch": 2.2380321665089875, + "grad_norm": 2.0793957710266113, + "learning_rate": 3.0899041712136474e-06, + "loss": 1.056, + "step": 2957 + }, + { + "epoch": 2.2387890255439924, + "grad_norm": 2.296447277069092, + "learning_rate": 3.084104836850719e-06, + "loss": 1.0637, + "step": 2958 + }, + { + "epoch": 2.2395458845789973, + "grad_norm": 2.2935092449188232, + "learning_rate": 3.0783099610129273e-06, + "loss": 1.0821, + "step": 2959 + }, + { + "epoch": 2.2403027436140017, + "grad_norm": 2.17787766456604, + "learning_rate": 3.0725195474355648e-06, + "loss": 1.0933, + "step": 2960 + }, + { + "epoch": 2.2410596026490066, + "grad_norm": 2.0003387928009033, + "learning_rate": 3.066733599851038e-06, + "loss": 1.0357, + "step": 2961 + }, + { + "epoch": 2.2418164616840115, + "grad_norm": 2.4547882080078125, + "learning_rate": 3.060952121988881e-06, + "loss": 1.1288, + "step": 2962 + }, + { + "epoch": 2.242573320719016, + "grad_norm": 2.319946765899658, + "learning_rate": 3.055175117575754e-06, + "loss": 1.09, + "step": 2963 + }, + { + "epoch": 2.243330179754021, + "grad_norm": 2.2627015113830566, + "learning_rate": 3.049402590335415e-06, + "loss": 1.1238, + "step": 2964 + }, + { + "epoch": 2.2440870387890257, + "grad_norm": 2.032540798187256, + "learning_rate": 3.043634543988752e-06, + "loss": 1.0622, + "step": 2965 + }, + { + "epoch": 2.24484389782403, + "grad_norm": 2.041095495223999, + "learning_rate": 3.037870982253763e-06, + "loss": 1.0729, + "step": 2966 + }, + { + "epoch": 2.245600756859035, + "grad_norm": 2.079834222793579, + "learning_rate": 3.032111908845547e-06, + "loss": 1.0541, + "step": 2967 + }, + { + "epoch": 2.24635761589404, + "grad_norm": 1.9280726909637451, + "learning_rate": 3.0263573274763165e-06, + "loss": 1.0715, + "step": 2968 + }, + { + "epoch": 2.2471144749290444, + "grad_norm": 1.9036996364593506, + "learning_rate": 3.0206072418553854e-06, + "loss": 1.0914, + "step": 2969 + }, + { + "epoch": 2.2478713339640493, + "grad_norm": 1.9973169565200806, + "learning_rate": 3.0148616556891774e-06, + "loss": 1.0612, + "step": 2970 + }, + { + "epoch": 2.248628192999054, + "grad_norm": 2.048168420791626, + "learning_rate": 3.009120572681206e-06, + "loss": 1.0573, + "step": 2971 + }, + { + "epoch": 2.2493850520340586, + "grad_norm": 1.9385312795639038, + "learning_rate": 3.0033839965320797e-06, + "loss": 1.0503, + "step": 2972 + }, + { + "epoch": 2.2501419110690635, + "grad_norm": 1.8759933710098267, + "learning_rate": 2.9976519309395154e-06, + "loss": 1.0739, + "step": 2973 + }, + { + "epoch": 2.250898770104068, + "grad_norm": 2.2850966453552246, + "learning_rate": 2.9919243795983116e-06, + "loss": 1.0669, + "step": 2974 + }, + { + "epoch": 2.251655629139073, + "grad_norm": 2.017787218093872, + "learning_rate": 2.9862013462003634e-06, + "loss": 1.0993, + "step": 2975 + }, + { + "epoch": 2.2524124881740777, + "grad_norm": 1.9540081024169922, + "learning_rate": 2.980482834434648e-06, + "loss": 1.116, + "step": 2976 + }, + { + "epoch": 2.253169347209082, + "grad_norm": 2.056605577468872, + "learning_rate": 2.974768847987239e-06, + "loss": 1.0612, + "step": 2977 + }, + { + "epoch": 2.253926206244087, + "grad_norm": 2.3890209197998047, + "learning_rate": 2.969059390541273e-06, + "loss": 1.0817, + "step": 2978 + }, + { + "epoch": 2.254683065279092, + "grad_norm": 1.947478175163269, + "learning_rate": 2.963354465776983e-06, + "loss": 1.0236, + "step": 2979 + }, + { + "epoch": 2.2554399243140963, + "grad_norm": 2.1459438800811768, + "learning_rate": 2.9576540773716783e-06, + "loss": 1.0725, + "step": 2980 + }, + { + "epoch": 2.2561967833491012, + "grad_norm": 2.0048837661743164, + "learning_rate": 2.9519582289997423e-06, + "loss": 1.1, + "step": 2981 + }, + { + "epoch": 2.256953642384106, + "grad_norm": 2.1805686950683594, + "learning_rate": 2.9462669243326357e-06, + "loss": 1.0963, + "step": 2982 + }, + { + "epoch": 2.2577105014191106, + "grad_norm": 1.9871695041656494, + "learning_rate": 2.9405801670388784e-06, + "loss": 1.1184, + "step": 2983 + }, + { + "epoch": 2.2584673604541154, + "grad_norm": 2.142199754714966, + "learning_rate": 2.934897960784075e-06, + "loss": 1.0546, + "step": 2984 + }, + { + "epoch": 2.2592242194891203, + "grad_norm": 2.1177968978881836, + "learning_rate": 2.9292203092308823e-06, + "loss": 1.1136, + "step": 2985 + }, + { + "epoch": 2.2599810785241248, + "grad_norm": 2.1006743907928467, + "learning_rate": 2.9235472160390315e-06, + "loss": 1.0091, + "step": 2986 + }, + { + "epoch": 2.2607379375591297, + "grad_norm": 1.978402853012085, + "learning_rate": 2.917878684865312e-06, + "loss": 1.0677, + "step": 2987 + }, + { + "epoch": 2.2614947965941345, + "grad_norm": 2.2147839069366455, + "learning_rate": 2.9122147193635757e-06, + "loss": 1.0809, + "step": 2988 + }, + { + "epoch": 2.262251655629139, + "grad_norm": 2.2743515968322754, + "learning_rate": 2.9065553231847215e-06, + "loss": 1.0647, + "step": 2989 + }, + { + "epoch": 2.263008514664144, + "grad_norm": 2.038224935531616, + "learning_rate": 2.900900499976714e-06, + "loss": 1.0953, + "step": 2990 + }, + { + "epoch": 2.2637653736991483, + "grad_norm": 1.9364351034164429, + "learning_rate": 2.895250253384567e-06, + "loss": 1.0241, + "step": 2991 + }, + { + "epoch": 2.264522232734153, + "grad_norm": 2.0556015968322754, + "learning_rate": 2.8896045870503405e-06, + "loss": 1.0358, + "step": 2992 + }, + { + "epoch": 2.265279091769158, + "grad_norm": 2.6211061477661133, + "learning_rate": 2.8839635046131477e-06, + "loss": 1.058, + "step": 2993 + }, + { + "epoch": 2.2660359508041625, + "grad_norm": 2.0403685569763184, + "learning_rate": 2.87832700970914e-06, + "loss": 1.0957, + "step": 2994 + }, + { + "epoch": 2.2667928098391674, + "grad_norm": 2.106076955795288, + "learning_rate": 2.8726951059715184e-06, + "loss": 1.0622, + "step": 2995 + }, + { + "epoch": 2.2675496688741723, + "grad_norm": 1.878516674041748, + "learning_rate": 2.867067797030522e-06, + "loss": 1.0636, + "step": 2996 + }, + { + "epoch": 2.2683065279091768, + "grad_norm": 2.178928852081299, + "learning_rate": 2.861445086513431e-06, + "loss": 1.0347, + "step": 2997 + }, + { + "epoch": 2.2690633869441816, + "grad_norm": 2.5624477863311768, + "learning_rate": 2.855826978044558e-06, + "loss": 1.0171, + "step": 2998 + }, + { + "epoch": 2.2698202459791865, + "grad_norm": 1.9493463039398193, + "learning_rate": 2.8502134752452488e-06, + "loss": 1.0763, + "step": 2999 + }, + { + "epoch": 2.270577105014191, + "grad_norm": 1.9162508249282837, + "learning_rate": 2.844604581733879e-06, + "loss": 1.1071, + "step": 3000 + }, + { + "epoch": 2.271333964049196, + "grad_norm": 2.097134828567505, + "learning_rate": 2.8390003011258576e-06, + "loss": 1.0987, + "step": 3001 + }, + { + "epoch": 2.2720908230842007, + "grad_norm": 2.138456106185913, + "learning_rate": 2.83340063703362e-06, + "loss": 1.0962, + "step": 3002 + }, + { + "epoch": 2.272847682119205, + "grad_norm": 2.057185411453247, + "learning_rate": 2.8278055930666243e-06, + "loss": 1.0849, + "step": 3003 + }, + { + "epoch": 2.27360454115421, + "grad_norm": 2.094721555709839, + "learning_rate": 2.822215172831354e-06, + "loss": 1.1004, + "step": 3004 + }, + { + "epoch": 2.274361400189215, + "grad_norm": 2.1358296871185303, + "learning_rate": 2.8166293799312994e-06, + "loss": 1.0583, + "step": 3005 + }, + { + "epoch": 2.2751182592242194, + "grad_norm": 1.955474615097046, + "learning_rate": 2.8110482179669823e-06, + "loss": 1.0828, + "step": 3006 + }, + { + "epoch": 2.2758751182592243, + "grad_norm": 2.084411144256592, + "learning_rate": 2.805471690535935e-06, + "loss": 1.0635, + "step": 3007 + }, + { + "epoch": 2.2766319772942287, + "grad_norm": 2.111748218536377, + "learning_rate": 2.799899801232702e-06, + "loss": 1.0604, + "step": 3008 + }, + { + "epoch": 2.2773888363292336, + "grad_norm": 2.2352616786956787, + "learning_rate": 2.7943325536488373e-06, + "loss": 1.1397, + "step": 3009 + }, + { + "epoch": 2.2781456953642385, + "grad_norm": 2.1407878398895264, + "learning_rate": 2.788769951372908e-06, + "loss": 1.083, + "step": 3010 + }, + { + "epoch": 2.2789025543992434, + "grad_norm": 2.0809216499328613, + "learning_rate": 2.7832119979904798e-06, + "loss": 1.0496, + "step": 3011 + }, + { + "epoch": 2.279659413434248, + "grad_norm": 2.0093045234680176, + "learning_rate": 2.77765869708412e-06, + "loss": 1.0543, + "step": 3012 + }, + { + "epoch": 2.2804162724692527, + "grad_norm": 1.9954379796981812, + "learning_rate": 2.7721100522334056e-06, + "loss": 1.1095, + "step": 3013 + }, + { + "epoch": 2.281173131504257, + "grad_norm": 2.1180033683776855, + "learning_rate": 2.7665660670149092e-06, + "loss": 1.0982, + "step": 3014 + }, + { + "epoch": 2.281929990539262, + "grad_norm": 2.0234453678131104, + "learning_rate": 2.761026745002201e-06, + "loss": 1.0808, + "step": 3015 + }, + { + "epoch": 2.282686849574267, + "grad_norm": 2.1585819721221924, + "learning_rate": 2.7554920897658386e-06, + "loss": 1.0775, + "step": 3016 + }, + { + "epoch": 2.2834437086092714, + "grad_norm": 1.9864180088043213, + "learning_rate": 2.7499621048733775e-06, + "loss": 1.1177, + "step": 3017 + }, + { + "epoch": 2.2842005676442763, + "grad_norm": 2.359938621520996, + "learning_rate": 2.744436793889368e-06, + "loss": 1.0951, + "step": 3018 + }, + { + "epoch": 2.284957426679281, + "grad_norm": 2.0253729820251465, + "learning_rate": 2.7389161603753312e-06, + "loss": 1.0414, + "step": 3019 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 1.9887871742248535, + "learning_rate": 2.73340020788979e-06, + "loss": 1.0882, + "step": 3020 + }, + { + "epoch": 2.2864711447492905, + "grad_norm": 1.971255898475647, + "learning_rate": 2.7278889399882435e-06, + "loss": 1.0867, + "step": 3021 + }, + { + "epoch": 2.2872280037842954, + "grad_norm": 2.0510776042938232, + "learning_rate": 2.7223823602231664e-06, + "loss": 1.0738, + "step": 3022 + }, + { + "epoch": 2.2879848628193, + "grad_norm": 1.953727126121521, + "learning_rate": 2.7168804721440177e-06, + "loss": 1.0262, + "step": 3023 + }, + { + "epoch": 2.2887417218543047, + "grad_norm": 2.0659303665161133, + "learning_rate": 2.7113832792972323e-06, + "loss": 1.0748, + "step": 3024 + }, + { + "epoch": 2.289498580889309, + "grad_norm": 2.147465229034424, + "learning_rate": 2.705890785226219e-06, + "loss": 1.0691, + "step": 3025 + }, + { + "epoch": 2.290255439924314, + "grad_norm": 2.1003715991973877, + "learning_rate": 2.7004029934713516e-06, + "loss": 1.0623, + "step": 3026 + }, + { + "epoch": 2.291012298959319, + "grad_norm": 1.9521448612213135, + "learning_rate": 2.6949199075699754e-06, + "loss": 1.1025, + "step": 3027 + }, + { + "epoch": 2.291769157994324, + "grad_norm": 1.998204231262207, + "learning_rate": 2.689441531056408e-06, + "loss": 1.0875, + "step": 3028 + }, + { + "epoch": 2.2925260170293282, + "grad_norm": 1.98150634765625, + "learning_rate": 2.683967867461925e-06, + "loss": 1.1051, + "step": 3029 + }, + { + "epoch": 2.293282876064333, + "grad_norm": 2.0418317317962646, + "learning_rate": 2.678498920314767e-06, + "loss": 1.0871, + "step": 3030 + }, + { + "epoch": 2.2940397350993376, + "grad_norm": 2.075697898864746, + "learning_rate": 2.673034693140136e-06, + "loss": 1.1588, + "step": 3031 + }, + { + "epoch": 2.2947965941343424, + "grad_norm": 2.049619197845459, + "learning_rate": 2.6675751894601928e-06, + "loss": 1.1, + "step": 3032 + }, + { + "epoch": 2.2955534531693473, + "grad_norm": 1.9731786251068115, + "learning_rate": 2.6621204127940403e-06, + "loss": 1.0883, + "step": 3033 + }, + { + "epoch": 2.2963103122043518, + "grad_norm": 2.0121383666992188, + "learning_rate": 2.656670366657748e-06, + "loss": 1.0914, + "step": 3034 + }, + { + "epoch": 2.2970671712393567, + "grad_norm": 2.0904853343963623, + "learning_rate": 2.651225054564334e-06, + "loss": 1.0803, + "step": 3035 + }, + { + "epoch": 2.2978240302743616, + "grad_norm": 2.0923306941986084, + "learning_rate": 2.645784480023764e-06, + "loss": 1.0528, + "step": 3036 + }, + { + "epoch": 2.298580889309366, + "grad_norm": 2.1617391109466553, + "learning_rate": 2.6403486465429524e-06, + "loss": 1.1454, + "step": 3037 + }, + { + "epoch": 2.299337748344371, + "grad_norm": 1.9403904676437378, + "learning_rate": 2.634917557625747e-06, + "loss": 1.1051, + "step": 3038 + }, + { + "epoch": 2.3000946073793758, + "grad_norm": 2.3704395294189453, + "learning_rate": 2.629491216772951e-06, + "loss": 1.0736, + "step": 3039 + }, + { + "epoch": 2.30085146641438, + "grad_norm": 2.0632617473602295, + "learning_rate": 2.6240696274822976e-06, + "loss": 1.0948, + "step": 3040 + }, + { + "epoch": 2.301608325449385, + "grad_norm": 1.9306870698928833, + "learning_rate": 2.6186527932484595e-06, + "loss": 1.0978, + "step": 3041 + }, + { + "epoch": 2.3023651844843895, + "grad_norm": 2.1989099979400635, + "learning_rate": 2.61324071756305e-06, + "loss": 1.0513, + "step": 3042 + }, + { + "epoch": 2.3031220435193944, + "grad_norm": 2.053589105606079, + "learning_rate": 2.60783340391461e-06, + "loss": 1.0828, + "step": 3043 + }, + { + "epoch": 2.3038789025543993, + "grad_norm": 2.009385347366333, + "learning_rate": 2.602430855788607e-06, + "loss": 1.0859, + "step": 3044 + }, + { + "epoch": 2.304635761589404, + "grad_norm": 2.086993932723999, + "learning_rate": 2.597033076667443e-06, + "loss": 1.1311, + "step": 3045 + }, + { + "epoch": 2.3053926206244086, + "grad_norm": 2.1662371158599854, + "learning_rate": 2.5916400700304476e-06, + "loss": 1.0746, + "step": 3046 + }, + { + "epoch": 2.3061494796594135, + "grad_norm": 1.9955482482910156, + "learning_rate": 2.5862518393538662e-06, + "loss": 1.0841, + "step": 3047 + }, + { + "epoch": 2.306906338694418, + "grad_norm": 2.2083284854888916, + "learning_rate": 2.5808683881108743e-06, + "loss": 1.0738, + "step": 3048 + }, + { + "epoch": 2.307663197729423, + "grad_norm": 2.1207940578460693, + "learning_rate": 2.5754897197715566e-06, + "loss": 1.1198, + "step": 3049 + }, + { + "epoch": 2.3084200567644277, + "grad_norm": 2.125546455383301, + "learning_rate": 2.5701158378029245e-06, + "loss": 1.0487, + "step": 3050 + }, + { + "epoch": 2.309176915799432, + "grad_norm": 2.025674343109131, + "learning_rate": 2.564746745668899e-06, + "loss": 1.0782, + "step": 3051 + }, + { + "epoch": 2.309933774834437, + "grad_norm": 2.1108992099761963, + "learning_rate": 2.559382446830318e-06, + "loss": 1.1041, + "step": 3052 + }, + { + "epoch": 2.310690633869442, + "grad_norm": 2.149214267730713, + "learning_rate": 2.554022944744925e-06, + "loss": 1.0332, + "step": 3053 + }, + { + "epoch": 2.3114474929044464, + "grad_norm": 2.708857536315918, + "learning_rate": 2.5486682428673753e-06, + "loss": 1.0605, + "step": 3054 + }, + { + "epoch": 2.3122043519394513, + "grad_norm": 2.029184341430664, + "learning_rate": 2.5433183446492214e-06, + "loss": 1.1014, + "step": 3055 + }, + { + "epoch": 2.312961210974456, + "grad_norm": 1.9624137878417969, + "learning_rate": 2.537973253538931e-06, + "loss": 1.0924, + "step": 3056 + }, + { + "epoch": 2.3137180700094606, + "grad_norm": 2.015695571899414, + "learning_rate": 2.5326329729818673e-06, + "loss": 1.0847, + "step": 3057 + }, + { + "epoch": 2.3144749290444655, + "grad_norm": 2.0661072731018066, + "learning_rate": 2.5272975064202943e-06, + "loss": 1.0434, + "step": 3058 + }, + { + "epoch": 2.3152317880794704, + "grad_norm": 2.1128456592559814, + "learning_rate": 2.521966857293378e-06, + "loss": 1.0661, + "step": 3059 + }, + { + "epoch": 2.315988647114475, + "grad_norm": 2.1260507106781006, + "learning_rate": 2.5166410290371626e-06, + "loss": 1.084, + "step": 3060 + }, + { + "epoch": 2.3167455061494797, + "grad_norm": 2.091785430908203, + "learning_rate": 2.5113200250845996e-06, + "loss": 1.0772, + "step": 3061 + }, + { + "epoch": 2.3175023651844846, + "grad_norm": 2.1781039237976074, + "learning_rate": 2.5060038488655302e-06, + "loss": 1.0469, + "step": 3062 + }, + { + "epoch": 2.318259224219489, + "grad_norm": 2.150576114654541, + "learning_rate": 2.500692503806678e-06, + "loss": 1.1243, + "step": 3063 + }, + { + "epoch": 2.319016083254494, + "grad_norm": 2.021026372909546, + "learning_rate": 2.4953859933316555e-06, + "loss": 1.0894, + "step": 3064 + }, + { + "epoch": 2.3197729422894984, + "grad_norm": 2.0633111000061035, + "learning_rate": 2.490084320860961e-06, + "loss": 1.07, + "step": 3065 + }, + { + "epoch": 2.3205298013245033, + "grad_norm": 2.0326859951019287, + "learning_rate": 2.4847874898119706e-06, + "loss": 1.1148, + "step": 3066 + }, + { + "epoch": 2.321286660359508, + "grad_norm": 2.0705957412719727, + "learning_rate": 2.479495503598935e-06, + "loss": 1.056, + "step": 3067 + }, + { + "epoch": 2.3220435193945126, + "grad_norm": 2.0693790912628174, + "learning_rate": 2.474208365632993e-06, + "loss": 1.0678, + "step": 3068 + }, + { + "epoch": 2.3228003784295175, + "grad_norm": 2.152256965637207, + "learning_rate": 2.468926079322153e-06, + "loss": 1.0826, + "step": 3069 + }, + { + "epoch": 2.3235572374645224, + "grad_norm": 1.9246243238449097, + "learning_rate": 2.463648648071298e-06, + "loss": 1.0872, + "step": 3070 + }, + { + "epoch": 2.324314096499527, + "grad_norm": 2.0952563285827637, + "learning_rate": 2.4583760752821752e-06, + "loss": 1.0851, + "step": 3071 + }, + { + "epoch": 2.3250709555345317, + "grad_norm": 2.1515979766845703, + "learning_rate": 2.453108364353406e-06, + "loss": 1.0494, + "step": 3072 + }, + { + "epoch": 2.3258278145695366, + "grad_norm": 2.0528197288513184, + "learning_rate": 2.447845518680481e-06, + "loss": 1.0619, + "step": 3073 + }, + { + "epoch": 2.326584673604541, + "grad_norm": 2.0087759494781494, + "learning_rate": 2.4425875416557426e-06, + "loss": 1.0229, + "step": 3074 + }, + { + "epoch": 2.327341532639546, + "grad_norm": 2.0219340324401855, + "learning_rate": 2.437334436668407e-06, + "loss": 1.0621, + "step": 3075 + }, + { + "epoch": 2.328098391674551, + "grad_norm": 2.0388236045837402, + "learning_rate": 2.432086207104549e-06, + "loss": 1.111, + "step": 3076 + }, + { + "epoch": 2.3288552507095552, + "grad_norm": 2.142197370529175, + "learning_rate": 2.426842856347089e-06, + "loss": 1.0878, + "step": 3077 + }, + { + "epoch": 2.32961210974456, + "grad_norm": 1.8765891790390015, + "learning_rate": 2.4216043877758163e-06, + "loss": 1.1534, + "step": 3078 + }, + { + "epoch": 2.330368968779565, + "grad_norm": 2.0405936241149902, + "learning_rate": 2.416370804767367e-06, + "loss": 1.0497, + "step": 3079 + }, + { + "epoch": 2.3311258278145695, + "grad_norm": 1.989956259727478, + "learning_rate": 2.4111421106952317e-06, + "loss": 1.0716, + "step": 3080 + }, + { + "epoch": 2.3318826868495743, + "grad_norm": 2.097310781478882, + "learning_rate": 2.4059183089297432e-06, + "loss": 1.0932, + "step": 3081 + }, + { + "epoch": 2.332639545884579, + "grad_norm": 1.9613529443740845, + "learning_rate": 2.4006994028380835e-06, + "loss": 1.0853, + "step": 3082 + }, + { + "epoch": 2.3333964049195837, + "grad_norm": 2.1626081466674805, + "learning_rate": 2.3954853957842816e-06, + "loss": 1.1113, + "step": 3083 + }, + { + "epoch": 2.3341532639545886, + "grad_norm": 1.9847509860992432, + "learning_rate": 2.3902762911292063e-06, + "loss": 1.0632, + "step": 3084 + }, + { + "epoch": 2.334910122989593, + "grad_norm": 2.432072639465332, + "learning_rate": 2.385072092230568e-06, + "loss": 1.0981, + "step": 3085 + }, + { + "epoch": 2.335666982024598, + "grad_norm": 2.0696966648101807, + "learning_rate": 2.3798728024429136e-06, + "loss": 1.053, + "step": 3086 + }, + { + "epoch": 2.3364238410596028, + "grad_norm": 2.017995595932007, + "learning_rate": 2.374678425117631e-06, + "loss": 1.1, + "step": 3087 + }, + { + "epoch": 2.337180700094607, + "grad_norm": 2.056910753250122, + "learning_rate": 2.369488963602927e-06, + "loss": 1.1009, + "step": 3088 + }, + { + "epoch": 2.337937559129612, + "grad_norm": 2.1294288635253906, + "learning_rate": 2.3643044212438547e-06, + "loss": 1.1008, + "step": 3089 + }, + { + "epoch": 2.338694418164617, + "grad_norm": 1.9634032249450684, + "learning_rate": 2.3591248013822885e-06, + "loss": 1.0868, + "step": 3090 + }, + { + "epoch": 2.3394512771996214, + "grad_norm": 2.2118031978607178, + "learning_rate": 2.3539501073569357e-06, + "loss": 1.1081, + "step": 3091 + }, + { + "epoch": 2.3402081362346263, + "grad_norm": 2.3364391326904297, + "learning_rate": 2.348780342503326e-06, + "loss": 1.0373, + "step": 3092 + }, + { + "epoch": 2.340964995269631, + "grad_norm": 2.061373472213745, + "learning_rate": 2.343615510153806e-06, + "loss": 1.0809, + "step": 3093 + }, + { + "epoch": 2.3417218543046356, + "grad_norm": 1.9650219678878784, + "learning_rate": 2.338455613637553e-06, + "loss": 1.0659, + "step": 3094 + }, + { + "epoch": 2.3424787133396405, + "grad_norm": 2.1445631980895996, + "learning_rate": 2.333300656280552e-06, + "loss": 1.0708, + "step": 3095 + }, + { + "epoch": 2.3432355723746454, + "grad_norm": 2.209373950958252, + "learning_rate": 2.328150641405614e-06, + "loss": 1.0744, + "step": 3096 + }, + { + "epoch": 2.34399243140965, + "grad_norm": 2.036855936050415, + "learning_rate": 2.3230055723323587e-06, + "loss": 1.0878, + "step": 3097 + }, + { + "epoch": 2.3447492904446547, + "grad_norm": 2.2812464237213135, + "learning_rate": 2.317865452377222e-06, + "loss": 1.0321, + "step": 3098 + }, + { + "epoch": 2.345506149479659, + "grad_norm": 1.9373234510421753, + "learning_rate": 2.312730284853442e-06, + "loss": 1.1768, + "step": 3099 + }, + { + "epoch": 2.346263008514664, + "grad_norm": 1.9641289710998535, + "learning_rate": 2.3076000730710715e-06, + "loss": 1.0922, + "step": 3100 + }, + { + "epoch": 2.347019867549669, + "grad_norm": 2.2891197204589844, + "learning_rate": 2.3024748203369697e-06, + "loss": 1.0122, + "step": 3101 + }, + { + "epoch": 2.347776726584674, + "grad_norm": 2.042477607727051, + "learning_rate": 2.2973545299547907e-06, + "loss": 1.0209, + "step": 3102 + }, + { + "epoch": 2.3485335856196783, + "grad_norm": 2.082688331604004, + "learning_rate": 2.2922392052250012e-06, + "loss": 1.0879, + "step": 3103 + }, + { + "epoch": 2.349290444654683, + "grad_norm": 2.036217451095581, + "learning_rate": 2.287128849444857e-06, + "loss": 1.0736, + "step": 3104 + }, + { + "epoch": 2.3500473036896876, + "grad_norm": 2.099870443344116, + "learning_rate": 2.282023465908417e-06, + "loss": 1.0502, + "step": 3105 + }, + { + "epoch": 2.3508041627246925, + "grad_norm": 2.0198404788970947, + "learning_rate": 2.276923057906534e-06, + "loss": 1.1233, + "step": 3106 + }, + { + "epoch": 2.3515610217596974, + "grad_norm": 2.331169843673706, + "learning_rate": 2.271827628726853e-06, + "loss": 1.0535, + "step": 3107 + }, + { + "epoch": 2.352317880794702, + "grad_norm": 2.124520778656006, + "learning_rate": 2.2667371816538124e-06, + "loss": 1.0393, + "step": 3108 + }, + { + "epoch": 2.3530747398297067, + "grad_norm": 1.9916775226593018, + "learning_rate": 2.261651719968635e-06, + "loss": 1.0811, + "step": 3109 + }, + { + "epoch": 2.3538315988647116, + "grad_norm": 2.134824514389038, + "learning_rate": 2.2565712469493285e-06, + "loss": 1.0755, + "step": 3110 + }, + { + "epoch": 2.354588457899716, + "grad_norm": 2.083906412124634, + "learning_rate": 2.251495765870691e-06, + "loss": 1.0626, + "step": 3111 + }, + { + "epoch": 2.355345316934721, + "grad_norm": 2.06776762008667, + "learning_rate": 2.246425280004301e-06, + "loss": 1.0667, + "step": 3112 + }, + { + "epoch": 2.356102175969726, + "grad_norm": 2.042806386947632, + "learning_rate": 2.241359792618514e-06, + "loss": 1.0589, + "step": 3113 + }, + { + "epoch": 2.3568590350047303, + "grad_norm": 2.0756242275238037, + "learning_rate": 2.2362993069784754e-06, + "loss": 1.0757, + "step": 3114 + }, + { + "epoch": 2.357615894039735, + "grad_norm": 2.315819501876831, + "learning_rate": 2.231243826346082e-06, + "loss": 1.1302, + "step": 3115 + }, + { + "epoch": 2.3583727530747396, + "grad_norm": 2.038795232772827, + "learning_rate": 2.2261933539800276e-06, + "loss": 0.9891, + "step": 3116 + }, + { + "epoch": 2.3591296121097445, + "grad_norm": 1.8599226474761963, + "learning_rate": 2.2211478931357686e-06, + "loss": 1.0786, + "step": 3117 + }, + { + "epoch": 2.3598864711447494, + "grad_norm": 2.074420928955078, + "learning_rate": 2.2161074470655327e-06, + "loss": 1.0555, + "step": 3118 + }, + { + "epoch": 2.3606433301797543, + "grad_norm": 2.0066890716552734, + "learning_rate": 2.2110720190183143e-06, + "loss": 1.077, + "step": 3119 + }, + { + "epoch": 2.3614001892147587, + "grad_norm": 2.137488603591919, + "learning_rate": 2.2060416122398754e-06, + "loss": 1.0421, + "step": 3120 + }, + { + "epoch": 2.3621570482497636, + "grad_norm": 2.0553901195526123, + "learning_rate": 2.2010162299727382e-06, + "loss": 1.0978, + "step": 3121 + }, + { + "epoch": 2.362913907284768, + "grad_norm": 1.9921813011169434, + "learning_rate": 2.1959958754561846e-06, + "loss": 1.0452, + "step": 3122 + }, + { + "epoch": 2.363670766319773, + "grad_norm": 2.0219523906707764, + "learning_rate": 2.1909805519262607e-06, + "loss": 1.0474, + "step": 3123 + }, + { + "epoch": 2.364427625354778, + "grad_norm": 1.9772000312805176, + "learning_rate": 2.185970262615767e-06, + "loss": 1.0281, + "step": 3124 + }, + { + "epoch": 2.3651844843897822, + "grad_norm": 2.09308123588562, + "learning_rate": 2.1809650107542632e-06, + "loss": 1.082, + "step": 3125 + }, + { + "epoch": 2.365941343424787, + "grad_norm": 1.9955263137817383, + "learning_rate": 2.175964799568052e-06, + "loss": 1.0324, + "step": 3126 + }, + { + "epoch": 2.366698202459792, + "grad_norm": 2.1914641857147217, + "learning_rate": 2.1709696322801972e-06, + "loss": 1.093, + "step": 3127 + }, + { + "epoch": 2.3674550614947965, + "grad_norm": 1.9175313711166382, + "learning_rate": 2.1659795121105097e-06, + "loss": 1.0671, + "step": 3128 + }, + { + "epoch": 2.3682119205298013, + "grad_norm": 2.153555154800415, + "learning_rate": 2.16099444227554e-06, + "loss": 1.0857, + "step": 3129 + }, + { + "epoch": 2.3689687795648062, + "grad_norm": 2.1872153282165527, + "learning_rate": 2.1560144259885886e-06, + "loss": 1.0465, + "step": 3130 + }, + { + "epoch": 2.3697256385998107, + "grad_norm": 2.0605931282043457, + "learning_rate": 2.151039466459703e-06, + "loss": 1.0579, + "step": 3131 + }, + { + "epoch": 2.3704824976348156, + "grad_norm": 2.1121630668640137, + "learning_rate": 2.1460695668956603e-06, + "loss": 1.0797, + "step": 3132 + }, + { + "epoch": 2.37123935666982, + "grad_norm": 1.927918553352356, + "learning_rate": 2.1411047304999855e-06, + "loss": 1.065, + "step": 3133 + }, + { + "epoch": 2.371996215704825, + "grad_norm": 2.0213940143585205, + "learning_rate": 2.1361449604729334e-06, + "loss": 1.0243, + "step": 3134 + }, + { + "epoch": 2.3727530747398298, + "grad_norm": 2.0634241104125977, + "learning_rate": 2.1311902600115026e-06, + "loss": 1.0243, + "step": 3135 + }, + { + "epoch": 2.3735099337748347, + "grad_norm": 2.063898801803589, + "learning_rate": 2.126240632309412e-06, + "loss": 1.0804, + "step": 3136 + }, + { + "epoch": 2.374266792809839, + "grad_norm": 1.9965025186538696, + "learning_rate": 2.1212960805571153e-06, + "loss": 1.0819, + "step": 3137 + }, + { + "epoch": 2.375023651844844, + "grad_norm": 2.0790200233459473, + "learning_rate": 2.1163566079417965e-06, + "loss": 1.0549, + "step": 3138 + }, + { + "epoch": 2.3757805108798484, + "grad_norm": 2.2082910537719727, + "learning_rate": 2.1114222176473647e-06, + "loss": 1.041, + "step": 3139 + }, + { + "epoch": 2.3765373699148533, + "grad_norm": 2.1791975498199463, + "learning_rate": 2.1064929128544527e-06, + "loss": 1.0671, + "step": 3140 + }, + { + "epoch": 2.377294228949858, + "grad_norm": 2.011662006378174, + "learning_rate": 2.1015686967404155e-06, + "loss": 1.0451, + "step": 3141 + }, + { + "epoch": 2.3780510879848626, + "grad_norm": 2.092410087585449, + "learning_rate": 2.0966495724793328e-06, + "loss": 0.999, + "step": 3142 + }, + { + "epoch": 2.3788079470198675, + "grad_norm": 2.0836849212646484, + "learning_rate": 2.0917355432419856e-06, + "loss": 1.0693, + "step": 3143 + }, + { + "epoch": 2.3795648060548724, + "grad_norm": 2.1676831245422363, + "learning_rate": 2.0868266121958895e-06, + "loss": 1.0786, + "step": 3144 + }, + { + "epoch": 2.380321665089877, + "grad_norm": 1.94955313205719, + "learning_rate": 2.0819227825052655e-06, + "loss": 1.04, + "step": 3145 + }, + { + "epoch": 2.3810785241248817, + "grad_norm": 2.1713364124298096, + "learning_rate": 2.0770240573310464e-06, + "loss": 1.0997, + "step": 3146 + }, + { + "epoch": 2.3818353831598866, + "grad_norm": 1.9667205810546875, + "learning_rate": 2.07213043983088e-06, + "loss": 1.0526, + "step": 3147 + }, + { + "epoch": 2.382592242194891, + "grad_norm": 2.1327104568481445, + "learning_rate": 2.067241933159111e-06, + "loss": 1.0717, + "step": 3148 + }, + { + "epoch": 2.383349101229896, + "grad_norm": 2.0339972972869873, + "learning_rate": 2.0623585404668027e-06, + "loss": 1.0583, + "step": 3149 + }, + { + "epoch": 2.384105960264901, + "grad_norm": 2.172558069229126, + "learning_rate": 2.0574802649017087e-06, + "loss": 1.0814, + "step": 3150 + }, + { + "epoch": 2.3848628192999053, + "grad_norm": 2.1538596153259277, + "learning_rate": 2.0526071096082958e-06, + "loss": 1.0713, + "step": 3151 + }, + { + "epoch": 2.38561967833491, + "grad_norm": 2.009945869445801, + "learning_rate": 2.0477390777277238e-06, + "loss": 1.0783, + "step": 3152 + }, + { + "epoch": 2.386376537369915, + "grad_norm": 2.024836540222168, + "learning_rate": 2.042876172397855e-06, + "loss": 1.0648, + "step": 3153 + }, + { + "epoch": 2.3871333964049195, + "grad_norm": 1.9101126194000244, + "learning_rate": 2.0380183967532398e-06, + "loss": 1.0476, + "step": 3154 + }, + { + "epoch": 2.3878902554399244, + "grad_norm": 1.9727280139923096, + "learning_rate": 2.033165753925127e-06, + "loss": 1.0658, + "step": 3155 + }, + { + "epoch": 2.388647114474929, + "grad_norm": 2.0228683948516846, + "learning_rate": 2.0283182470414605e-06, + "loss": 1.0536, + "step": 3156 + }, + { + "epoch": 2.3894039735099337, + "grad_norm": 2.0721933841705322, + "learning_rate": 2.0234758792268626e-06, + "loss": 1.1227, + "step": 3157 + }, + { + "epoch": 2.3901608325449386, + "grad_norm": 2.15034818649292, + "learning_rate": 2.0186386536026563e-06, + "loss": 1.1274, + "step": 3158 + }, + { + "epoch": 2.390917691579943, + "grad_norm": 2.1228785514831543, + "learning_rate": 2.0138065732868377e-06, + "loss": 1.1003, + "step": 3159 + }, + { + "epoch": 2.391674550614948, + "grad_norm": 1.9539825916290283, + "learning_rate": 2.008979641394094e-06, + "loss": 1.0618, + "step": 3160 + }, + { + "epoch": 2.392431409649953, + "grad_norm": 2.054403066635132, + "learning_rate": 2.0041578610357924e-06, + "loss": 1.0761, + "step": 3161 + }, + { + "epoch": 2.3931882686849573, + "grad_norm": 2.2176952362060547, + "learning_rate": 1.9993412353199797e-06, + "loss": 1.0955, + "step": 3162 + }, + { + "epoch": 2.393945127719962, + "grad_norm": 1.9848977327346802, + "learning_rate": 1.9945297673513813e-06, + "loss": 1.0478, + "step": 3163 + }, + { + "epoch": 2.394701986754967, + "grad_norm": 2.0067944526672363, + "learning_rate": 1.9897234602313935e-06, + "loss": 1.0606, + "step": 3164 + }, + { + "epoch": 2.3954588457899715, + "grad_norm": 2.1978671550750732, + "learning_rate": 1.9849223170580863e-06, + "loss": 1.0673, + "step": 3165 + }, + { + "epoch": 2.3962157048249764, + "grad_norm": 2.103545665740967, + "learning_rate": 1.9801263409262044e-06, + "loss": 1.0697, + "step": 3166 + }, + { + "epoch": 2.3969725638599813, + "grad_norm": 2.0121638774871826, + "learning_rate": 1.975335534927164e-06, + "loss": 1.1051, + "step": 3167 + }, + { + "epoch": 2.3977294228949857, + "grad_norm": 2.227635383605957, + "learning_rate": 1.970549902149043e-06, + "loss": 1.1104, + "step": 3168 + }, + { + "epoch": 2.3984862819299906, + "grad_norm": 1.98334538936615, + "learning_rate": 1.965769445676593e-06, + "loss": 1.0538, + "step": 3169 + }, + { + "epoch": 2.3992431409649955, + "grad_norm": 2.092841863632202, + "learning_rate": 1.9609941685912137e-06, + "loss": 1.076, + "step": 3170 + }, + { + "epoch": 2.4, + "grad_norm": 1.9714951515197754, + "learning_rate": 1.9562240739709797e-06, + "loss": 1.0408, + "step": 3171 + }, + { + "epoch": 2.400756859035005, + "grad_norm": 2.228931427001953, + "learning_rate": 1.951459164890623e-06, + "loss": 1.07, + "step": 3172 + }, + { + "epoch": 2.4015137180700092, + "grad_norm": 2.2674131393432617, + "learning_rate": 1.9466994444215306e-06, + "loss": 1.0301, + "step": 3173 + }, + { + "epoch": 2.402270577105014, + "grad_norm": 2.017943859100342, + "learning_rate": 1.941944915631745e-06, + "loss": 1.0159, + "step": 3174 + }, + { + "epoch": 2.403027436140019, + "grad_norm": 2.2940430641174316, + "learning_rate": 1.937195581585966e-06, + "loss": 1.0656, + "step": 3175 + }, + { + "epoch": 2.403784295175024, + "grad_norm": 2.035090684890747, + "learning_rate": 1.9324514453455404e-06, + "loss": 1.0572, + "step": 3176 + }, + { + "epoch": 2.4045411542100283, + "grad_norm": 2.0770013332366943, + "learning_rate": 1.927712509968461e-06, + "loss": 1.0763, + "step": 3177 + }, + { + "epoch": 2.4052980132450332, + "grad_norm": 2.1525466442108154, + "learning_rate": 1.9229787785093784e-06, + "loss": 1.069, + "step": 3178 + }, + { + "epoch": 2.4060548722800377, + "grad_norm": 1.9798002243041992, + "learning_rate": 1.9182502540195826e-06, + "loss": 1.04, + "step": 3179 + }, + { + "epoch": 2.4068117313150426, + "grad_norm": 2.0167088508605957, + "learning_rate": 1.9135269395470117e-06, + "loss": 1.0951, + "step": 3180 + }, + { + "epoch": 2.4075685903500474, + "grad_norm": 1.9377844333648682, + "learning_rate": 1.908808838136235e-06, + "loss": 1.0678, + "step": 3181 + }, + { + "epoch": 2.408325449385052, + "grad_norm": 1.8350154161453247, + "learning_rate": 1.904095952828474e-06, + "loss": 1.0734, + "step": 3182 + }, + { + "epoch": 2.4090823084200568, + "grad_norm": 2.102295160293579, + "learning_rate": 1.8993882866615832e-06, + "loss": 1.0446, + "step": 3183 + }, + { + "epoch": 2.4098391674550617, + "grad_norm": 2.003739833831787, + "learning_rate": 1.8946858426700479e-06, + "loss": 1.1126, + "step": 3184 + }, + { + "epoch": 2.410596026490066, + "grad_norm": 2.410670280456543, + "learning_rate": 1.8899886238849949e-06, + "loss": 1.0803, + "step": 3185 + }, + { + "epoch": 2.411352885525071, + "grad_norm": 1.9924821853637695, + "learning_rate": 1.8852966333341822e-06, + "loss": 1.0713, + "step": 3186 + }, + { + "epoch": 2.412109744560076, + "grad_norm": 2.1071012020111084, + "learning_rate": 1.880609874041989e-06, + "loss": 1.0772, + "step": 3187 + }, + { + "epoch": 2.4128666035950803, + "grad_norm": 2.1172494888305664, + "learning_rate": 1.8759283490294333e-06, + "loss": 1.0584, + "step": 3188 + }, + { + "epoch": 2.413623462630085, + "grad_norm": 2.2310361862182617, + "learning_rate": 1.8712520613141525e-06, + "loss": 1.039, + "step": 3189 + }, + { + "epoch": 2.4143803216650896, + "grad_norm": 1.90047287940979, + "learning_rate": 1.8665810139104157e-06, + "loss": 1.0776, + "step": 3190 + }, + { + "epoch": 2.4151371807000945, + "grad_norm": 2.110424041748047, + "learning_rate": 1.8619152098291044e-06, + "loss": 1.0546, + "step": 3191 + }, + { + "epoch": 2.4158940397350994, + "grad_norm": 1.9257051944732666, + "learning_rate": 1.8572546520777214e-06, + "loss": 1.0843, + "step": 3192 + }, + { + "epoch": 2.4166508987701043, + "grad_norm": 2.1789603233337402, + "learning_rate": 1.852599343660396e-06, + "loss": 1.0479, + "step": 3193 + }, + { + "epoch": 2.4174077578051087, + "grad_norm": 1.8343688249588013, + "learning_rate": 1.847949287577868e-06, + "loss": 0.9984, + "step": 3194 + }, + { + "epoch": 2.4181646168401136, + "grad_norm": 2.1044609546661377, + "learning_rate": 1.843304486827492e-06, + "loss": 1.0603, + "step": 3195 + }, + { + "epoch": 2.418921475875118, + "grad_norm": 2.0383095741271973, + "learning_rate": 1.838664944403236e-06, + "loss": 1.0733, + "step": 3196 + }, + { + "epoch": 2.419678334910123, + "grad_norm": 1.9661799669265747, + "learning_rate": 1.8340306632956847e-06, + "loss": 1.0257, + "step": 3197 + }, + { + "epoch": 2.420435193945128, + "grad_norm": 2.1200876235961914, + "learning_rate": 1.8294016464920133e-06, + "loss": 1.0597, + "step": 3198 + }, + { + "epoch": 2.4211920529801323, + "grad_norm": 2.138803005218506, + "learning_rate": 1.8247778969760206e-06, + "loss": 1.0518, + "step": 3199 + }, + { + "epoch": 2.421948912015137, + "grad_norm": 1.937528371810913, + "learning_rate": 1.8201594177281053e-06, + "loss": 1.1165, + "step": 3200 + }, + { + "epoch": 2.422705771050142, + "grad_norm": 1.99111008644104, + "learning_rate": 1.8155462117252693e-06, + "loss": 1.0687, + "step": 3201 + }, + { + "epoch": 2.4234626300851465, + "grad_norm": 2.2547271251678467, + "learning_rate": 1.8109382819411164e-06, + "loss": 1.0613, + "step": 3202 + }, + { + "epoch": 2.4242194891201514, + "grad_norm": 1.8853436708450317, + "learning_rate": 1.8063356313458443e-06, + "loss": 1.1003, + "step": 3203 + }, + { + "epoch": 2.4249763481551563, + "grad_norm": 1.9870060682296753, + "learning_rate": 1.801738262906254e-06, + "loss": 1.0924, + "step": 3204 + }, + { + "epoch": 2.4257332071901607, + "grad_norm": 1.9391242265701294, + "learning_rate": 1.7971461795857367e-06, + "loss": 1.0116, + "step": 3205 + }, + { + "epoch": 2.4264900662251656, + "grad_norm": 2.092609167098999, + "learning_rate": 1.7925593843442798e-06, + "loss": 1.1001, + "step": 3206 + }, + { + "epoch": 2.42724692526017, + "grad_norm": 2.029475212097168, + "learning_rate": 1.787977880138463e-06, + "loss": 1.0716, + "step": 3207 + }, + { + "epoch": 2.428003784295175, + "grad_norm": 2.054161787033081, + "learning_rate": 1.783401669921456e-06, + "loss": 1.0752, + "step": 3208 + }, + { + "epoch": 2.42876064333018, + "grad_norm": 1.762110710144043, + "learning_rate": 1.7788307566430083e-06, + "loss": 1.0639, + "step": 3209 + }, + { + "epoch": 2.4295175023651847, + "grad_norm": 2.044447898864746, + "learning_rate": 1.774265143249467e-06, + "loss": 1.0905, + "step": 3210 + }, + { + "epoch": 2.430274361400189, + "grad_norm": 2.024933338165283, + "learning_rate": 1.7697048326837516e-06, + "loss": 1.0152, + "step": 3211 + }, + { + "epoch": 2.431031220435194, + "grad_norm": 1.9569860696792603, + "learning_rate": 1.7651498278853708e-06, + "loss": 1.0603, + "step": 3212 + }, + { + "epoch": 2.4317880794701985, + "grad_norm": 2.0603296756744385, + "learning_rate": 1.760600131790414e-06, + "loss": 1.1086, + "step": 3213 + }, + { + "epoch": 2.4325449385052034, + "grad_norm": 1.8924018144607544, + "learning_rate": 1.7560557473315413e-06, + "loss": 1.0509, + "step": 3214 + }, + { + "epoch": 2.4333017975402083, + "grad_norm": 1.9490152597427368, + "learning_rate": 1.7515166774379947e-06, + "loss": 1.0518, + "step": 3215 + }, + { + "epoch": 2.4340586565752127, + "grad_norm": 2.046515703201294, + "learning_rate": 1.746982925035591e-06, + "loss": 1.0466, + "step": 3216 + }, + { + "epoch": 2.4348155156102176, + "grad_norm": 2.0436580181121826, + "learning_rate": 1.7424544930467205e-06, + "loss": 1.0642, + "step": 3217 + }, + { + "epoch": 2.4355723746452225, + "grad_norm": 2.1087872982025146, + "learning_rate": 1.7379313843903357e-06, + "loss": 1.0838, + "step": 3218 + }, + { + "epoch": 2.436329233680227, + "grad_norm": 1.896474838256836, + "learning_rate": 1.7334136019819681e-06, + "loss": 1.0678, + "step": 3219 + }, + { + "epoch": 2.437086092715232, + "grad_norm": 2.087778091430664, + "learning_rate": 1.7289011487337059e-06, + "loss": 1.0402, + "step": 3220 + }, + { + "epoch": 2.4378429517502367, + "grad_norm": 1.9922432899475098, + "learning_rate": 1.7243940275542126e-06, + "loss": 1.119, + "step": 3221 + }, + { + "epoch": 2.438599810785241, + "grad_norm": 2.1278886795043945, + "learning_rate": 1.7198922413487073e-06, + "loss": 1.1303, + "step": 3222 + }, + { + "epoch": 2.439356669820246, + "grad_norm": 2.059356689453125, + "learning_rate": 1.7153957930189735e-06, + "loss": 1.0732, + "step": 3223 + }, + { + "epoch": 2.440113528855251, + "grad_norm": 2.127638578414917, + "learning_rate": 1.7109046854633587e-06, + "loss": 1.0715, + "step": 3224 + }, + { + "epoch": 2.4408703878902553, + "grad_norm": 2.0281498432159424, + "learning_rate": 1.7064189215767526e-06, + "loss": 1.0378, + "step": 3225 + }, + { + "epoch": 2.4416272469252602, + "grad_norm": 2.8992788791656494, + "learning_rate": 1.7019385042506134e-06, + "loss": 1.0982, + "step": 3226 + }, + { + "epoch": 2.442384105960265, + "grad_norm": 2.046388626098633, + "learning_rate": 1.697463436372951e-06, + "loss": 1.0689, + "step": 3227 + }, + { + "epoch": 2.4431409649952696, + "grad_norm": 1.9682432413101196, + "learning_rate": 1.692993720828327e-06, + "loss": 1.0886, + "step": 3228 + }, + { + "epoch": 2.4438978240302744, + "grad_norm": 2.190717935562134, + "learning_rate": 1.6885293604978495e-06, + "loss": 1.0941, + "step": 3229 + }, + { + "epoch": 2.444654683065279, + "grad_norm": 2.263735294342041, + "learning_rate": 1.6840703582591808e-06, + "loss": 1.1485, + "step": 3230 + }, + { + "epoch": 2.4454115421002838, + "grad_norm": 1.917043924331665, + "learning_rate": 1.6796167169865243e-06, + "loss": 1.044, + "step": 3231 + }, + { + "epoch": 2.4461684011352887, + "grad_norm": 2.120823621749878, + "learning_rate": 1.6751684395506248e-06, + "loss": 1.0492, + "step": 3232 + }, + { + "epoch": 2.446925260170293, + "grad_norm": 1.9636114835739136, + "learning_rate": 1.6707255288187776e-06, + "loss": 1.0525, + "step": 3233 + }, + { + "epoch": 2.447682119205298, + "grad_norm": 2.068773031234741, + "learning_rate": 1.6662879876548164e-06, + "loss": 1.1185, + "step": 3234 + }, + { + "epoch": 2.448438978240303, + "grad_norm": 1.9608315229415894, + "learning_rate": 1.661855818919112e-06, + "loss": 1.0705, + "step": 3235 + }, + { + "epoch": 2.4491958372753073, + "grad_norm": 2.037750005722046, + "learning_rate": 1.65742902546857e-06, + "loss": 1.1109, + "step": 3236 + }, + { + "epoch": 2.449952696310312, + "grad_norm": 2.2666871547698975, + "learning_rate": 1.653007610156637e-06, + "loss": 1.0955, + "step": 3237 + }, + { + "epoch": 2.450709555345317, + "grad_norm": 2.0493760108947754, + "learning_rate": 1.6485915758332899e-06, + "loss": 1.0354, + "step": 3238 + }, + { + "epoch": 2.4514664143803215, + "grad_norm": 2.0443224906921387, + "learning_rate": 1.6441809253450347e-06, + "loss": 1.0605, + "step": 3239 + }, + { + "epoch": 2.4522232734153264, + "grad_norm": 1.7710448503494263, + "learning_rate": 1.6397756615349103e-06, + "loss": 1.064, + "step": 3240 + }, + { + "epoch": 2.4529801324503313, + "grad_norm": 2.0574166774749756, + "learning_rate": 1.6353757872424848e-06, + "loss": 1.0986, + "step": 3241 + }, + { + "epoch": 2.4537369914853357, + "grad_norm": 2.0350000858306885, + "learning_rate": 1.6309813053038476e-06, + "loss": 1.0722, + "step": 3242 + }, + { + "epoch": 2.4544938505203406, + "grad_norm": 2.178621530532837, + "learning_rate": 1.6265922185516136e-06, + "loss": 1.0777, + "step": 3243 + }, + { + "epoch": 2.4552507095553455, + "grad_norm": 1.874701976776123, + "learning_rate": 1.6222085298149237e-06, + "loss": 1.0678, + "step": 3244 + }, + { + "epoch": 2.45600756859035, + "grad_norm": 2.080073356628418, + "learning_rate": 1.617830241919439e-06, + "loss": 1.1109, + "step": 3245 + }, + { + "epoch": 2.456764427625355, + "grad_norm": 2.067389488220215, + "learning_rate": 1.6134573576873347e-06, + "loss": 1.0613, + "step": 3246 + }, + { + "epoch": 2.4575212866603593, + "grad_norm": 2.2184066772460938, + "learning_rate": 1.6090898799373013e-06, + "loss": 1.0445, + "step": 3247 + }, + { + "epoch": 2.458278145695364, + "grad_norm": 2.1395821571350098, + "learning_rate": 1.6047278114845524e-06, + "loss": 1.1291, + "step": 3248 + }, + { + "epoch": 2.459035004730369, + "grad_norm": 1.878059983253479, + "learning_rate": 1.6003711551408108e-06, + "loss": 1.1197, + "step": 3249 + }, + { + "epoch": 2.4597918637653735, + "grad_norm": 2.079202651977539, + "learning_rate": 1.5960199137143096e-06, + "loss": 1.0601, + "step": 3250 + }, + { + "epoch": 2.4605487228003784, + "grad_norm": 2.1114516258239746, + "learning_rate": 1.5916740900097936e-06, + "loss": 1.0981, + "step": 3251 + }, + { + "epoch": 2.4613055818353833, + "grad_norm": 2.0840392112731934, + "learning_rate": 1.5873336868285188e-06, + "loss": 1.0901, + "step": 3252 + }, + { + "epoch": 2.4620624408703877, + "grad_norm": 2.1868133544921875, + "learning_rate": 1.582998706968233e-06, + "loss": 1.1103, + "step": 3253 + }, + { + "epoch": 2.4628192999053926, + "grad_norm": 2.0153892040252686, + "learning_rate": 1.5786691532232047e-06, + "loss": 1.0825, + "step": 3254 + }, + { + "epoch": 2.4635761589403975, + "grad_norm": 2.147407054901123, + "learning_rate": 1.5743450283841957e-06, + "loss": 1.0529, + "step": 3255 + }, + { + "epoch": 2.464333017975402, + "grad_norm": 2.2476887702941895, + "learning_rate": 1.5700263352384732e-06, + "loss": 1.0551, + "step": 3256 + }, + { + "epoch": 2.465089877010407, + "grad_norm": 1.897383689880371, + "learning_rate": 1.5657130765698006e-06, + "loss": 1.0773, + "step": 3257 + }, + { + "epoch": 2.4658467360454117, + "grad_norm": 2.2343618869781494, + "learning_rate": 1.56140525515844e-06, + "loss": 1.0388, + "step": 3258 + }, + { + "epoch": 2.466603595080416, + "grad_norm": 2.182474136352539, + "learning_rate": 1.5571028737811414e-06, + "loss": 1.0837, + "step": 3259 + }, + { + "epoch": 2.467360454115421, + "grad_norm": 1.94349205493927, + "learning_rate": 1.5528059352111586e-06, + "loss": 1.0374, + "step": 3260 + }, + { + "epoch": 2.468117313150426, + "grad_norm": 2.3165524005889893, + "learning_rate": 1.5485144422182325e-06, + "loss": 1.049, + "step": 3261 + }, + { + "epoch": 2.4688741721854304, + "grad_norm": 2.2094292640686035, + "learning_rate": 1.5442283975685937e-06, + "loss": 1.0934, + "step": 3262 + }, + { + "epoch": 2.4696310312204353, + "grad_norm": 2.0244195461273193, + "learning_rate": 1.5399478040249638e-06, + "loss": 1.044, + "step": 3263 + }, + { + "epoch": 2.4703878902554397, + "grad_norm": 1.9300179481506348, + "learning_rate": 1.5356726643465427e-06, + "loss": 1.1156, + "step": 3264 + }, + { + "epoch": 2.4711447492904446, + "grad_norm": 2.06846022605896, + "learning_rate": 1.5314029812890258e-06, + "loss": 1.012, + "step": 3265 + }, + { + "epoch": 2.4719016083254495, + "grad_norm": 2.2604005336761475, + "learning_rate": 1.5271387576045804e-06, + "loss": 1.1042, + "step": 3266 + }, + { + "epoch": 2.4726584673604544, + "grad_norm": 2.3489127159118652, + "learning_rate": 1.5228799960418639e-06, + "loss": 1.05, + "step": 3267 + }, + { + "epoch": 2.473415326395459, + "grad_norm": 2.0200610160827637, + "learning_rate": 1.518626699346009e-06, + "loss": 1.1298, + "step": 3268 + }, + { + "epoch": 2.4741721854304637, + "grad_norm": 2.0748353004455566, + "learning_rate": 1.514378870258623e-06, + "loss": 1.0477, + "step": 3269 + }, + { + "epoch": 2.474929044465468, + "grad_norm": 2.0303914546966553, + "learning_rate": 1.510136511517792e-06, + "loss": 1.0319, + "step": 3270 + }, + { + "epoch": 2.475685903500473, + "grad_norm": 1.9617363214492798, + "learning_rate": 1.5058996258580788e-06, + "loss": 1.1149, + "step": 3271 + }, + { + "epoch": 2.476442762535478, + "grad_norm": 2.0544240474700928, + "learning_rate": 1.5016682160105153e-06, + "loss": 1.0733, + "step": 3272 + }, + { + "epoch": 2.4771996215704823, + "grad_norm": 2.3402349948883057, + "learning_rate": 1.4974422847026002e-06, + "loss": 1.0615, + "step": 3273 + }, + { + "epoch": 2.4779564806054872, + "grad_norm": 2.0029454231262207, + "learning_rate": 1.4932218346583082e-06, + "loss": 1.0688, + "step": 3274 + }, + { + "epoch": 2.478713339640492, + "grad_norm": 1.8487077951431274, + "learning_rate": 1.4890068685980732e-06, + "loss": 1.1071, + "step": 3275 + }, + { + "epoch": 2.4794701986754966, + "grad_norm": 2.282620906829834, + "learning_rate": 1.4847973892388003e-06, + "loss": 1.0802, + "step": 3276 + }, + { + "epoch": 2.4802270577105014, + "grad_norm": 1.9295916557312012, + "learning_rate": 1.4805933992938547e-06, + "loss": 1.0663, + "step": 3277 + }, + { + "epoch": 2.4809839167455063, + "grad_norm": 2.0331246852874756, + "learning_rate": 1.476394901473066e-06, + "loss": 1.0906, + "step": 3278 + }, + { + "epoch": 2.4817407757805108, + "grad_norm": 2.1244752407073975, + "learning_rate": 1.4722018984827247e-06, + "loss": 1.0655, + "step": 3279 + }, + { + "epoch": 2.4824976348155157, + "grad_norm": 2.140397787094116, + "learning_rate": 1.4680143930255675e-06, + "loss": 1.0969, + "step": 3280 + }, + { + "epoch": 2.48325449385052, + "grad_norm": 2.1430792808532715, + "learning_rate": 1.4638323878008022e-06, + "loss": 1.06, + "step": 3281 + }, + { + "epoch": 2.484011352885525, + "grad_norm": 2.28765606880188, + "learning_rate": 1.459655885504086e-06, + "loss": 1.1159, + "step": 3282 + }, + { + "epoch": 2.48476821192053, + "grad_norm": 1.9069238901138306, + "learning_rate": 1.455484888827526e-06, + "loss": 1.0083, + "step": 3283 + }, + { + "epoch": 2.4855250709555348, + "grad_norm": 1.848893642425537, + "learning_rate": 1.4513194004596865e-06, + "loss": 1.0527, + "step": 3284 + }, + { + "epoch": 2.486281929990539, + "grad_norm": 1.8594064712524414, + "learning_rate": 1.4471594230855774e-06, + "loss": 1.0815, + "step": 3285 + }, + { + "epoch": 2.487038789025544, + "grad_norm": 1.9376791715621948, + "learning_rate": 1.4430049593866543e-06, + "loss": 1.0403, + "step": 3286 + }, + { + "epoch": 2.4877956480605485, + "grad_norm": 2.031545639038086, + "learning_rate": 1.4388560120408215e-06, + "loss": 1.0378, + "step": 3287 + }, + { + "epoch": 2.4885525070955534, + "grad_norm": 2.0290255546569824, + "learning_rate": 1.4347125837224266e-06, + "loss": 1.05, + "step": 3288 + }, + { + "epoch": 2.4893093661305583, + "grad_norm": 2.070533037185669, + "learning_rate": 1.4305746771022623e-06, + "loss": 1.0854, + "step": 3289 + }, + { + "epoch": 2.4900662251655628, + "grad_norm": 3.2161612510681152, + "learning_rate": 1.4264422948475618e-06, + "loss": 1.0484, + "step": 3290 + }, + { + "epoch": 2.4908230842005676, + "grad_norm": 2.044058322906494, + "learning_rate": 1.4223154396219906e-06, + "loss": 1.0543, + "step": 3291 + }, + { + "epoch": 2.4915799432355725, + "grad_norm": 1.9972931146621704, + "learning_rate": 1.4181941140856595e-06, + "loss": 1.0482, + "step": 3292 + }, + { + "epoch": 2.492336802270577, + "grad_norm": 2.115438222885132, + "learning_rate": 1.4140783208951142e-06, + "loss": 1.1006, + "step": 3293 + }, + { + "epoch": 2.493093661305582, + "grad_norm": 1.9351952075958252, + "learning_rate": 1.4099680627033266e-06, + "loss": 1.0229, + "step": 3294 + }, + { + "epoch": 2.4938505203405867, + "grad_norm": 2.0593087673187256, + "learning_rate": 1.4058633421597104e-06, + "loss": 1.0653, + "step": 3295 + }, + { + "epoch": 2.494607379375591, + "grad_norm": 2.1452414989471436, + "learning_rate": 1.4017641619101074e-06, + "loss": 1.1039, + "step": 3296 + }, + { + "epoch": 2.495364238410596, + "grad_norm": 1.799978494644165, + "learning_rate": 1.3976705245967832e-06, + "loss": 1.0778, + "step": 3297 + }, + { + "epoch": 2.4961210974456005, + "grad_norm": 1.9537346363067627, + "learning_rate": 1.3935824328584335e-06, + "loss": 1.0688, + "step": 3298 + }, + { + "epoch": 2.4968779564806054, + "grad_norm": 2.1485798358917236, + "learning_rate": 1.3894998893301829e-06, + "loss": 1.0416, + "step": 3299 + }, + { + "epoch": 2.4976348155156103, + "grad_norm": 2.167556047439575, + "learning_rate": 1.3854228966435768e-06, + "loss": 1.0693, + "step": 3300 + }, + { + "epoch": 2.498391674550615, + "grad_norm": 2.1335930824279785, + "learning_rate": 1.3813514574265815e-06, + "loss": 1.0789, + "step": 3301 + }, + { + "epoch": 2.4991485335856196, + "grad_norm": 1.9826858043670654, + "learning_rate": 1.3772855743035818e-06, + "loss": 1.071, + "step": 3302 + }, + { + "epoch": 2.4999053926206245, + "grad_norm": 1.9714877605438232, + "learning_rate": 1.3732252498953874e-06, + "loss": 1.0915, + "step": 3303 + }, + { + "epoch": 2.500662251655629, + "grad_norm": 1.859167218208313, + "learning_rate": 1.3691704868192202e-06, + "loss": 1.0345, + "step": 3304 + }, + { + "epoch": 2.501419110690634, + "grad_norm": 1.9658515453338623, + "learning_rate": 1.3651212876887181e-06, + "loss": 1.0506, + "step": 3305 + }, + { + "epoch": 2.5021759697256387, + "grad_norm": 2.1254079341888428, + "learning_rate": 1.361077655113935e-06, + "loss": 1.0721, + "step": 3306 + }, + { + "epoch": 2.5029328287606436, + "grad_norm": 2.0176279544830322, + "learning_rate": 1.3570395917013365e-06, + "loss": 1.0711, + "step": 3307 + }, + { + "epoch": 2.503689687795648, + "grad_norm": 2.091989278793335, + "learning_rate": 1.353007100053791e-06, + "loss": 1.1103, + "step": 3308 + }, + { + "epoch": 2.504446546830653, + "grad_norm": 2.0587007999420166, + "learning_rate": 1.348980182770584e-06, + "loss": 1.0569, + "step": 3309 + }, + { + "epoch": 2.5052034058656574, + "grad_norm": 1.9162755012512207, + "learning_rate": 1.344958842447405e-06, + "loss": 1.0914, + "step": 3310 + }, + { + "epoch": 2.5059602649006623, + "grad_norm": 1.995250940322876, + "learning_rate": 1.3409430816763478e-06, + "loss": 1.0789, + "step": 3311 + }, + { + "epoch": 2.506717123935667, + "grad_norm": 2.189922571182251, + "learning_rate": 1.3369329030459152e-06, + "loss": 1.1112, + "step": 3312 + }, + { + "epoch": 2.5074739829706716, + "grad_norm": 2.006410837173462, + "learning_rate": 1.3329283091410014e-06, + "loss": 1.0911, + "step": 3313 + }, + { + "epoch": 2.5082308420056765, + "grad_norm": 1.9690699577331543, + "learning_rate": 1.3289293025429082e-06, + "loss": 1.0642, + "step": 3314 + }, + { + "epoch": 2.508987701040681, + "grad_norm": 1.9968689680099487, + "learning_rate": 1.324935885829334e-06, + "loss": 1.1037, + "step": 3315 + }, + { + "epoch": 2.509744560075686, + "grad_norm": 2.0873682498931885, + "learning_rate": 1.3209480615743746e-06, + "loss": 1.0804, + "step": 3316 + }, + { + "epoch": 2.5105014191106907, + "grad_norm": 2.100817918777466, + "learning_rate": 1.3169658323485212e-06, + "loss": 1.0313, + "step": 3317 + }, + { + "epoch": 2.5112582781456956, + "grad_norm": 2.1082022190093994, + "learning_rate": 1.3129892007186602e-06, + "loss": 1.0815, + "step": 3318 + }, + { + "epoch": 2.5120151371807, + "grad_norm": 2.102774143218994, + "learning_rate": 1.3090181692480642e-06, + "loss": 1.0529, + "step": 3319 + }, + { + "epoch": 2.512771996215705, + "grad_norm": 1.8931456804275513, + "learning_rate": 1.305052740496402e-06, + "loss": 1.029, + "step": 3320 + }, + { + "epoch": 2.5135288552507093, + "grad_norm": 1.9600942134857178, + "learning_rate": 1.301092917019724e-06, + "loss": 1.0499, + "step": 3321 + }, + { + "epoch": 2.5142857142857142, + "grad_norm": 1.9435330629348755, + "learning_rate": 1.2971387013704767e-06, + "loss": 1.0662, + "step": 3322 + }, + { + "epoch": 2.515042573320719, + "grad_norm": 2.6212613582611084, + "learning_rate": 1.2931900960974872e-06, + "loss": 1.0569, + "step": 3323 + }, + { + "epoch": 2.515799432355724, + "grad_norm": 2.1485402584075928, + "learning_rate": 1.2892471037459634e-06, + "loss": 1.0798, + "step": 3324 + }, + { + "epoch": 2.5165562913907285, + "grad_norm": 1.9852415323257446, + "learning_rate": 1.285309726857499e-06, + "loss": 1.0856, + "step": 3325 + }, + { + "epoch": 2.5173131504257333, + "grad_norm": 2.0448715686798096, + "learning_rate": 1.281377967970067e-06, + "loss": 1.0894, + "step": 3326 + }, + { + "epoch": 2.518070009460738, + "grad_norm": 1.966200590133667, + "learning_rate": 1.2774518296180222e-06, + "loss": 1.0583, + "step": 3327 + }, + { + "epoch": 2.5188268684957427, + "grad_norm": 2.0975255966186523, + "learning_rate": 1.2735313143320901e-06, + "loss": 1.1087, + "step": 3328 + }, + { + "epoch": 2.5195837275307476, + "grad_norm": 1.8325495719909668, + "learning_rate": 1.2696164246393766e-06, + "loss": 1.0857, + "step": 3329 + }, + { + "epoch": 2.520340586565752, + "grad_norm": 2.3163001537323, + "learning_rate": 1.265707163063358e-06, + "loss": 1.0535, + "step": 3330 + }, + { + "epoch": 2.521097445600757, + "grad_norm": 2.0799732208251953, + "learning_rate": 1.2618035321238856e-06, + "loss": 1.1036, + "step": 3331 + }, + { + "epoch": 2.5218543046357613, + "grad_norm": 1.9881048202514648, + "learning_rate": 1.257905534337181e-06, + "loss": 1.0693, + "step": 3332 + }, + { + "epoch": 2.522611163670766, + "grad_norm": 2.5260820388793945, + "learning_rate": 1.2540131722158336e-06, + "loss": 1.0339, + "step": 3333 + }, + { + "epoch": 2.523368022705771, + "grad_norm": 2.0032739639282227, + "learning_rate": 1.2501264482688052e-06, + "loss": 1.0888, + "step": 3334 + }, + { + "epoch": 2.524124881740776, + "grad_norm": 1.9609510898590088, + "learning_rate": 1.2462453650014107e-06, + "loss": 1.0654, + "step": 3335 + }, + { + "epoch": 2.5248817407757804, + "grad_norm": 2.018681049346924, + "learning_rate": 1.2423699249153408e-06, + "loss": 1.0763, + "step": 3336 + }, + { + "epoch": 2.5256385998107853, + "grad_norm": 1.9069342613220215, + "learning_rate": 1.2385001305086455e-06, + "loss": 1.0883, + "step": 3337 + }, + { + "epoch": 2.5263954588457898, + "grad_norm": 2.1848833560943604, + "learning_rate": 1.2346359842757345e-06, + "loss": 1.0894, + "step": 3338 + }, + { + "epoch": 2.5271523178807946, + "grad_norm": 2.210022449493408, + "learning_rate": 1.230777488707379e-06, + "loss": 1.0807, + "step": 3339 + }, + { + "epoch": 2.5279091769157995, + "grad_norm": 2.1791112422943115, + "learning_rate": 1.2269246462907065e-06, + "loss": 1.0707, + "step": 3340 + }, + { + "epoch": 2.5286660359508044, + "grad_norm": 1.9715594053268433, + "learning_rate": 1.2230774595092005e-06, + "loss": 1.0304, + "step": 3341 + }, + { + "epoch": 2.529422894985809, + "grad_norm": 2.082552433013916, + "learning_rate": 1.219235930842696e-06, + "loss": 1.088, + "step": 3342 + }, + { + "epoch": 2.5301797540208137, + "grad_norm": 2.0022144317626953, + "learning_rate": 1.215400062767385e-06, + "loss": 1.0153, + "step": 3343 + }, + { + "epoch": 2.530936613055818, + "grad_norm": 2.0545942783355713, + "learning_rate": 1.2115698577558096e-06, + "loss": 1.0895, + "step": 3344 + }, + { + "epoch": 2.531693472090823, + "grad_norm": 1.8354177474975586, + "learning_rate": 1.207745318276865e-06, + "loss": 1.082, + "step": 3345 + }, + { + "epoch": 2.532450331125828, + "grad_norm": 2.023404121398926, + "learning_rate": 1.203926446795787e-06, + "loss": 1.0675, + "step": 3346 + }, + { + "epoch": 2.5332071901608324, + "grad_norm": 1.8171032667160034, + "learning_rate": 1.2001132457741615e-06, + "loss": 1.0464, + "step": 3347 + }, + { + "epoch": 2.5339640491958373, + "grad_norm": 1.9232815504074097, + "learning_rate": 1.1963057176699249e-06, + "loss": 1.0647, + "step": 3348 + }, + { + "epoch": 2.5347209082308417, + "grad_norm": 1.9270484447479248, + "learning_rate": 1.1925038649373456e-06, + "loss": 1.1044, + "step": 3349 + }, + { + "epoch": 2.5354777672658466, + "grad_norm": 2.1430654525756836, + "learning_rate": 1.1887076900270418e-06, + "loss": 1.0809, + "step": 3350 + }, + { + "epoch": 2.5362346263008515, + "grad_norm": 2.0014898777008057, + "learning_rate": 1.1849171953859737e-06, + "loss": 1.0902, + "step": 3351 + }, + { + "epoch": 2.5369914853358564, + "grad_norm": 2.0257580280303955, + "learning_rate": 1.1811323834574302e-06, + "loss": 1.0911, + "step": 3352 + }, + { + "epoch": 2.537748344370861, + "grad_norm": 1.9925347566604614, + "learning_rate": 1.1773532566810477e-06, + "loss": 1.0666, + "step": 3353 + }, + { + "epoch": 2.5385052034058657, + "grad_norm": 2.072805404663086, + "learning_rate": 1.1735798174927917e-06, + "loss": 1.1, + "step": 3354 + }, + { + "epoch": 2.53926206244087, + "grad_norm": 2.0256335735321045, + "learning_rate": 1.1698120683249663e-06, + "loss": 1.0585, + "step": 3355 + }, + { + "epoch": 2.540018921475875, + "grad_norm": 2.217489004135132, + "learning_rate": 1.1660500116062037e-06, + "loss": 1.0761, + "step": 3356 + }, + { + "epoch": 2.54077578051088, + "grad_norm": 1.91505765914917, + "learning_rate": 1.1622936497614644e-06, + "loss": 1.0579, + "step": 3357 + }, + { + "epoch": 2.541532639545885, + "grad_norm": 1.9349195957183838, + "learning_rate": 1.1585429852120462e-06, + "loss": 1.074, + "step": 3358 + }, + { + "epoch": 2.5422894985808893, + "grad_norm": 1.915028691291809, + "learning_rate": 1.1547980203755697e-06, + "loss": 1.0717, + "step": 3359 + }, + { + "epoch": 2.543046357615894, + "grad_norm": 2.154155731201172, + "learning_rate": 1.1510587576659814e-06, + "loss": 1.057, + "step": 3360 + }, + { + "epoch": 2.5438032166508986, + "grad_norm": 2.6311988830566406, + "learning_rate": 1.1473251994935532e-06, + "loss": 1.1184, + "step": 3361 + }, + { + "epoch": 2.5445600756859035, + "grad_norm": 2.038876533508301, + "learning_rate": 1.1435973482648844e-06, + "loss": 1.0693, + "step": 3362 + }, + { + "epoch": 2.5453169347209084, + "grad_norm": 2.0545339584350586, + "learning_rate": 1.1398752063828815e-06, + "loss": 1.0887, + "step": 3363 + }, + { + "epoch": 2.546073793755913, + "grad_norm": 1.8213523626327515, + "learning_rate": 1.1361587762467873e-06, + "loss": 1.0866, + "step": 3364 + }, + { + "epoch": 2.5468306527909177, + "grad_norm": 2.055341958999634, + "learning_rate": 1.1324480602521524e-06, + "loss": 1.0923, + "step": 3365 + }, + { + "epoch": 2.5475875118259226, + "grad_norm": 2.2624013423919678, + "learning_rate": 1.1287430607908508e-06, + "loss": 1.0532, + "step": 3366 + }, + { + "epoch": 2.548344370860927, + "grad_norm": 2.7478582859039307, + "learning_rate": 1.1250437802510686e-06, + "loss": 1.0849, + "step": 3367 + }, + { + "epoch": 2.549101229895932, + "grad_norm": 1.8595587015151978, + "learning_rate": 1.1213502210173044e-06, + "loss": 1.0518, + "step": 3368 + }, + { + "epoch": 2.549858088930937, + "grad_norm": 2.0965359210968018, + "learning_rate": 1.1176623854703688e-06, + "loss": 1.0152, + "step": 3369 + }, + { + "epoch": 2.5506149479659412, + "grad_norm": 2.1459243297576904, + "learning_rate": 1.1139802759873852e-06, + "loss": 1.1309, + "step": 3370 + }, + { + "epoch": 2.551371807000946, + "grad_norm": 1.981099247932434, + "learning_rate": 1.110303894941786e-06, + "loss": 1.1094, + "step": 3371 + }, + { + "epoch": 2.5521286660359506, + "grad_norm": 2.0165510177612305, + "learning_rate": 1.10663324470331e-06, + "loss": 1.0812, + "step": 3372 + }, + { + "epoch": 2.5528855250709555, + "grad_norm": 1.839483380317688, + "learning_rate": 1.102968327638005e-06, + "loss": 1.0098, + "step": 3373 + }, + { + "epoch": 2.5536423841059603, + "grad_norm": 2.0099551677703857, + "learning_rate": 1.0993091461082154e-06, + "loss": 1.0111, + "step": 3374 + }, + { + "epoch": 2.5543992431409652, + "grad_norm": 2.1071789264678955, + "learning_rate": 1.0956557024725986e-06, + "loss": 1.018, + "step": 3375 + }, + { + "epoch": 2.5551561021759697, + "grad_norm": 2.2769486904144287, + "learning_rate": 1.0920079990861043e-06, + "loss": 1.0538, + "step": 3376 + }, + { + "epoch": 2.5559129612109746, + "grad_norm": 1.969045877456665, + "learning_rate": 1.088366038299989e-06, + "loss": 1.0186, + "step": 3377 + }, + { + "epoch": 2.556669820245979, + "grad_norm": 2.0804572105407715, + "learning_rate": 1.0847298224618053e-06, + "loss": 1.0581, + "step": 3378 + }, + { + "epoch": 2.557426679280984, + "grad_norm": 1.9232372045516968, + "learning_rate": 1.081099353915403e-06, + "loss": 1.0507, + "step": 3379 + }, + { + "epoch": 2.5581835383159888, + "grad_norm": 1.9603554010391235, + "learning_rate": 1.077474635000925e-06, + "loss": 1.0878, + "step": 3380 + }, + { + "epoch": 2.558940397350993, + "grad_norm": 2.1331593990325928, + "learning_rate": 1.07385566805481e-06, + "loss": 1.0751, + "step": 3381 + }, + { + "epoch": 2.559697256385998, + "grad_norm": 2.48708438873291, + "learning_rate": 1.070242455409791e-06, + "loss": 1.0929, + "step": 3382 + }, + { + "epoch": 2.560454115421003, + "grad_norm": 2.023110866546631, + "learning_rate": 1.066634999394886e-06, + "loss": 1.0477, + "step": 3383 + }, + { + "epoch": 2.5612109744560074, + "grad_norm": 2.201087236404419, + "learning_rate": 1.0630333023354118e-06, + "loss": 1.0736, + "step": 3384 + }, + { + "epoch": 2.5619678334910123, + "grad_norm": 2.0009500980377197, + "learning_rate": 1.0594373665529613e-06, + "loss": 1.0059, + "step": 3385 + }, + { + "epoch": 2.562724692526017, + "grad_norm": 1.9756640195846558, + "learning_rate": 1.0558471943654217e-06, + "loss": 1.0857, + "step": 3386 + }, + { + "epoch": 2.5634815515610216, + "grad_norm": 2.2636256217956543, + "learning_rate": 1.0522627880869646e-06, + "loss": 1.0848, + "step": 3387 + }, + { + "epoch": 2.5642384105960265, + "grad_norm": 2.0621442794799805, + "learning_rate": 1.0486841500280441e-06, + "loss": 1.0821, + "step": 3388 + }, + { + "epoch": 2.564995269631031, + "grad_norm": 2.0804009437561035, + "learning_rate": 1.0451112824953961e-06, + "loss": 1.072, + "step": 3389 + }, + { + "epoch": 2.565752128666036, + "grad_norm": 2.1010797023773193, + "learning_rate": 1.0415441877920349e-06, + "loss": 1.0384, + "step": 3390 + }, + { + "epoch": 2.5665089877010407, + "grad_norm": 2.0294950008392334, + "learning_rate": 1.037982868217254e-06, + "loss": 1.1007, + "step": 3391 + }, + { + "epoch": 2.5672658467360456, + "grad_norm": 2.2194080352783203, + "learning_rate": 1.0344273260666264e-06, + "loss": 1.0293, + "step": 3392 + }, + { + "epoch": 2.56802270577105, + "grad_norm": 2.0473580360412598, + "learning_rate": 1.0308775636320018e-06, + "loss": 1.0989, + "step": 3393 + }, + { + "epoch": 2.568779564806055, + "grad_norm": 2.1180901527404785, + "learning_rate": 1.027333583201503e-06, + "loss": 1.073, + "step": 3394 + }, + { + "epoch": 2.5695364238410594, + "grad_norm": 1.9611269235610962, + "learning_rate": 1.0237953870595262e-06, + "loss": 1.0638, + "step": 3395 + }, + { + "epoch": 2.5702932828760643, + "grad_norm": 2.031759023666382, + "learning_rate": 1.0202629774867378e-06, + "loss": 1.044, + "step": 3396 + }, + { + "epoch": 2.571050141911069, + "grad_norm": 2.155648708343506, + "learning_rate": 1.016736356760073e-06, + "loss": 1.0815, + "step": 3397 + }, + { + "epoch": 2.571807000946074, + "grad_norm": 2.0659499168395996, + "learning_rate": 1.0132155271527401e-06, + "loss": 1.0977, + "step": 3398 + }, + { + "epoch": 2.5725638599810785, + "grad_norm": 2.2170495986938477, + "learning_rate": 1.0097004909342112e-06, + "loss": 1.0449, + "step": 3399 + }, + { + "epoch": 2.5733207190160834, + "grad_norm": 1.9732736349105835, + "learning_rate": 1.0061912503702258e-06, + "loss": 1.0475, + "step": 3400 + }, + { + "epoch": 2.574077578051088, + "grad_norm": 1.9781739711761475, + "learning_rate": 1.0026878077227885e-06, + "loss": 1.0778, + "step": 3401 + }, + { + "epoch": 2.5748344370860927, + "grad_norm": 2.0298542976379395, + "learning_rate": 9.99190165250161e-07, + "loss": 1.0379, + "step": 3402 + }, + { + "epoch": 2.5755912961210976, + "grad_norm": 1.7894214391708374, + "learning_rate": 9.95698325206874e-07, + "loss": 1.069, + "step": 3403 + }, + { + "epoch": 2.576348155156102, + "grad_norm": 2.022477865219116, + "learning_rate": 9.922122898437122e-07, + "loss": 1.0623, + "step": 3404 + }, + { + "epoch": 2.577105014191107, + "grad_norm": 1.8968234062194824, + "learning_rate": 9.887320614077198e-07, + "loss": 1.04, + "step": 3405 + }, + { + "epoch": 2.5778618732261114, + "grad_norm": 2.217832326889038, + "learning_rate": 9.852576421422033e-07, + "loss": 1.0943, + "step": 3406 + }, + { + "epoch": 2.5786187322611163, + "grad_norm": 2.0771234035491943, + "learning_rate": 9.817890342867157e-07, + "loss": 1.0767, + "step": 3407 + }, + { + "epoch": 2.579375591296121, + "grad_norm": 1.9438964128494263, + "learning_rate": 9.783262400770708e-07, + "loss": 1.1243, + "step": 3408 + }, + { + "epoch": 2.580132450331126, + "grad_norm": 2.441040277481079, + "learning_rate": 9.748692617453326e-07, + "loss": 1.0624, + "step": 3409 + }, + { + "epoch": 2.5808893093661305, + "grad_norm": 2.0702598094940186, + "learning_rate": 9.714181015198182e-07, + "loss": 1.0696, + "step": 3410 + }, + { + "epoch": 2.5816461684011354, + "grad_norm": 2.0918853282928467, + "learning_rate": 9.67972761625091e-07, + "loss": 1.0814, + "step": 3411 + }, + { + "epoch": 2.58240302743614, + "grad_norm": 1.9000964164733887, + "learning_rate": 9.645332442819653e-07, + "loss": 1.0554, + "step": 3412 + }, + { + "epoch": 2.5831598864711447, + "grad_norm": 2.1118955612182617, + "learning_rate": 9.610995517075005e-07, + "loss": 1.0622, + "step": 3413 + }, + { + "epoch": 2.5839167455061496, + "grad_norm": 2.136005163192749, + "learning_rate": 9.57671686115003e-07, + "loss": 1.0871, + "step": 3414 + }, + { + "epoch": 2.5846736045411545, + "grad_norm": 2.0861973762512207, + "learning_rate": 9.542496497140228e-07, + "loss": 1.0348, + "step": 3415 + }, + { + "epoch": 2.585430463576159, + "grad_norm": 1.9754106998443604, + "learning_rate": 9.50833444710354e-07, + "loss": 1.0797, + "step": 3416 + }, + { + "epoch": 2.586187322611164, + "grad_norm": 2.138561964035034, + "learning_rate": 9.474230733060293e-07, + "loss": 1.1018, + "step": 3417 + }, + { + "epoch": 2.5869441816461682, + "grad_norm": 2.1578221321105957, + "learning_rate": 9.440185376993193e-07, + "loss": 1.1082, + "step": 3418 + }, + { + "epoch": 2.587701040681173, + "grad_norm": 1.928044080734253, + "learning_rate": 9.406198400847376e-07, + "loss": 1.0723, + "step": 3419 + }, + { + "epoch": 2.588457899716178, + "grad_norm": 2.0299084186553955, + "learning_rate": 9.372269826530338e-07, + "loss": 1.0557, + "step": 3420 + }, + { + "epoch": 2.5892147587511825, + "grad_norm": 2.100691556930542, + "learning_rate": 9.338399675911917e-07, + "loss": 1.0221, + "step": 3421 + }, + { + "epoch": 2.5899716177861873, + "grad_norm": 2.015913724899292, + "learning_rate": 9.304587970824288e-07, + "loss": 1.0651, + "step": 3422 + }, + { + "epoch": 2.590728476821192, + "grad_norm": 1.8734519481658936, + "learning_rate": 9.270834733061999e-07, + "loss": 1.0554, + "step": 3423 + }, + { + "epoch": 2.5914853358561967, + "grad_norm": 1.9088720083236694, + "learning_rate": 9.237139984381806e-07, + "loss": 1.0847, + "step": 3424 + }, + { + "epoch": 2.5922421948912016, + "grad_norm": 2.083169460296631, + "learning_rate": 9.203503746502859e-07, + "loss": 1.0714, + "step": 3425 + }, + { + "epoch": 2.5929990539262064, + "grad_norm": 1.9658424854278564, + "learning_rate": 9.169926041106579e-07, + "loss": 1.0231, + "step": 3426 + }, + { + "epoch": 2.593755912961211, + "grad_norm": 1.955154299736023, + "learning_rate": 9.13640688983662e-07, + "loss": 1.0539, + "step": 3427 + }, + { + "epoch": 2.5945127719962158, + "grad_norm": 2.0878820419311523, + "learning_rate": 9.102946314298959e-07, + "loss": 1.0911, + "step": 3428 + }, + { + "epoch": 2.59526963103122, + "grad_norm": 1.9909857511520386, + "learning_rate": 9.069544336061716e-07, + "loss": 1.083, + "step": 3429 + }, + { + "epoch": 2.596026490066225, + "grad_norm": 2.0630910396575928, + "learning_rate": 9.036200976655337e-07, + "loss": 1.0977, + "step": 3430 + }, + { + "epoch": 2.59678334910123, + "grad_norm": 1.982391595840454, + "learning_rate": 9.002916257572411e-07, + "loss": 1.0271, + "step": 3431 + }, + { + "epoch": 2.597540208136235, + "grad_norm": 1.9987069368362427, + "learning_rate": 8.969690200267786e-07, + "loss": 1.0813, + "step": 3432 + }, + { + "epoch": 2.5982970671712393, + "grad_norm": 1.983818531036377, + "learning_rate": 8.936522826158452e-07, + "loss": 1.0776, + "step": 3433 + }, + { + "epoch": 2.599053926206244, + "grad_norm": 1.9349209070205688, + "learning_rate": 8.903414156623622e-07, + "loss": 1.0509, + "step": 3434 + }, + { + "epoch": 2.5998107852412486, + "grad_norm": 2.01790714263916, + "learning_rate": 8.870364213004612e-07, + "loss": 1.0949, + "step": 3435 + }, + { + "epoch": 2.6005676442762535, + "grad_norm": 2.0256693363189697, + "learning_rate": 8.837373016604916e-07, + "loss": 1.0572, + "step": 3436 + }, + { + "epoch": 2.6013245033112584, + "grad_norm": 1.989288568496704, + "learning_rate": 8.804440588690183e-07, + "loss": 1.0321, + "step": 3437 + }, + { + "epoch": 2.602081362346263, + "grad_norm": 2.1254732608795166, + "learning_rate": 8.771566950488107e-07, + "loss": 1.0513, + "step": 3438 + }, + { + "epoch": 2.6028382213812677, + "grad_norm": 2.4187963008880615, + "learning_rate": 8.738752123188587e-07, + "loss": 1.0755, + "step": 3439 + }, + { + "epoch": 2.6035950804162726, + "grad_norm": 2.0207037925720215, + "learning_rate": 8.705996127943503e-07, + "loss": 1.0669, + "step": 3440 + }, + { + "epoch": 2.604351939451277, + "grad_norm": 2.1482834815979004, + "learning_rate": 8.6732989858669e-07, + "loss": 1.0675, + "step": 3441 + }, + { + "epoch": 2.605108798486282, + "grad_norm": 1.9809141159057617, + "learning_rate": 8.640660718034855e-07, + "loss": 1.1257, + "step": 3442 + }, + { + "epoch": 2.605865657521287, + "grad_norm": 2.1395535469055176, + "learning_rate": 8.608081345485507e-07, + "loss": 1.1311, + "step": 3443 + }, + { + "epoch": 2.6066225165562913, + "grad_norm": 2.1757044792175293, + "learning_rate": 8.575560889219027e-07, + "loss": 1.0888, + "step": 3444 + }, + { + "epoch": 2.607379375591296, + "grad_norm": 1.8578970432281494, + "learning_rate": 8.543099370197591e-07, + "loss": 1.0527, + "step": 3445 + }, + { + "epoch": 2.6081362346263006, + "grad_norm": 1.966065526008606, + "learning_rate": 8.51069680934539e-07, + "loss": 1.0301, + "step": 3446 + }, + { + "epoch": 2.6088930936613055, + "grad_norm": 1.9428819417953491, + "learning_rate": 8.478353227548625e-07, + "loss": 1.0255, + "step": 3447 + }, + { + "epoch": 2.6096499526963104, + "grad_norm": 1.873252272605896, + "learning_rate": 8.446068645655477e-07, + "loss": 1.092, + "step": 3448 + }, + { + "epoch": 2.6104068117313153, + "grad_norm": 2.0765583515167236, + "learning_rate": 8.413843084476109e-07, + "loss": 1.0985, + "step": 3449 + }, + { + "epoch": 2.6111636707663197, + "grad_norm": 2.0844457149505615, + "learning_rate": 8.381676564782655e-07, + "loss": 1.1138, + "step": 3450 + }, + { + "epoch": 2.6119205298013246, + "grad_norm": 1.9434021711349487, + "learning_rate": 8.349569107309078e-07, + "loss": 1.059, + "step": 3451 + }, + { + "epoch": 2.612677388836329, + "grad_norm": 2.0002236366271973, + "learning_rate": 8.317520732751409e-07, + "loss": 1.0701, + "step": 3452 + }, + { + "epoch": 2.613434247871334, + "grad_norm": 2.0566999912261963, + "learning_rate": 8.285531461767541e-07, + "loss": 1.066, + "step": 3453 + }, + { + "epoch": 2.614191106906339, + "grad_norm": 1.926048994064331, + "learning_rate": 8.253601314977264e-07, + "loss": 1.0811, + "step": 3454 + }, + { + "epoch": 2.6149479659413433, + "grad_norm": 2.086893320083618, + "learning_rate": 8.22173031296225e-07, + "loss": 1.0647, + "step": 3455 + }, + { + "epoch": 2.615704824976348, + "grad_norm": 2.1144938468933105, + "learning_rate": 8.189918476266104e-07, + "loss": 1.0737, + "step": 3456 + }, + { + "epoch": 2.616461684011353, + "grad_norm": 1.8913697004318237, + "learning_rate": 8.158165825394222e-07, + "loss": 1.0621, + "step": 3457 + }, + { + "epoch": 2.6172185430463575, + "grad_norm": 1.850129246711731, + "learning_rate": 8.126472380813851e-07, + "loss": 1.0475, + "step": 3458 + }, + { + "epoch": 2.6179754020813624, + "grad_norm": 1.9463554620742798, + "learning_rate": 8.094838162954142e-07, + "loss": 1.0652, + "step": 3459 + }, + { + "epoch": 2.6187322611163673, + "grad_norm": 1.9536323547363281, + "learning_rate": 8.063263192206013e-07, + "loss": 1.0567, + "step": 3460 + }, + { + "epoch": 2.6194891201513717, + "grad_norm": 1.9918063879013062, + "learning_rate": 8.031747488922231e-07, + "loss": 1.0604, + "step": 3461 + }, + { + "epoch": 2.6202459791863766, + "grad_norm": 2.074878215789795, + "learning_rate": 8.0002910734173e-07, + "loss": 1.0467, + "step": 3462 + }, + { + "epoch": 2.621002838221381, + "grad_norm": 2.0185697078704834, + "learning_rate": 7.968893965967558e-07, + "loss": 1.1014, + "step": 3463 + }, + { + "epoch": 2.621759697256386, + "grad_norm": 1.9658541679382324, + "learning_rate": 7.937556186811127e-07, + "loss": 1.0582, + "step": 3464 + }, + { + "epoch": 2.622516556291391, + "grad_norm": 2.0424704551696777, + "learning_rate": 7.906277756147835e-07, + "loss": 1.0843, + "step": 3465 + }, + { + "epoch": 2.6232734153263957, + "grad_norm": 2.1158058643341064, + "learning_rate": 7.875058694139282e-07, + "loss": 1.0359, + "step": 3466 + }, + { + "epoch": 2.6240302743614, + "grad_norm": 2.1120176315307617, + "learning_rate": 7.843899020908823e-07, + "loss": 1.0796, + "step": 3467 + }, + { + "epoch": 2.624787133396405, + "grad_norm": 1.910689353942871, + "learning_rate": 7.812798756541483e-07, + "loss": 1.0547, + "step": 3468 + }, + { + "epoch": 2.6255439924314095, + "grad_norm": 2.1395435333251953, + "learning_rate": 7.781757921084019e-07, + "loss": 1.0968, + "step": 3469 + }, + { + "epoch": 2.6263008514664143, + "grad_norm": 1.9301244020462036, + "learning_rate": 7.750776534544889e-07, + "loss": 1.0543, + "step": 3470 + }, + { + "epoch": 2.6270577105014192, + "grad_norm": 1.9667290449142456, + "learning_rate": 7.719854616894243e-07, + "loss": 1.0575, + "step": 3471 + }, + { + "epoch": 2.627814569536424, + "grad_norm": 1.8904736042022705, + "learning_rate": 7.688992188063853e-07, + "loss": 1.0781, + "step": 3472 + }, + { + "epoch": 2.6285714285714286, + "grad_norm": 1.991716980934143, + "learning_rate": 7.658189267947159e-07, + "loss": 1.052, + "step": 3473 + }, + { + "epoch": 2.6293282876064334, + "grad_norm": 2.0166666507720947, + "learning_rate": 7.627445876399259e-07, + "loss": 1.0668, + "step": 3474 + }, + { + "epoch": 2.630085146641438, + "grad_norm": 2.020129919052124, + "learning_rate": 7.596762033236895e-07, + "loss": 1.0539, + "step": 3475 + }, + { + "epoch": 2.6308420056764428, + "grad_norm": 2.1295621395111084, + "learning_rate": 7.566137758238386e-07, + "loss": 1.0264, + "step": 3476 + }, + { + "epoch": 2.6315988647114477, + "grad_norm": 2.1414082050323486, + "learning_rate": 7.53557307114367e-07, + "loss": 1.0476, + "step": 3477 + }, + { + "epoch": 2.632355723746452, + "grad_norm": 1.8784303665161133, + "learning_rate": 7.505067991654335e-07, + "loss": 1.0252, + "step": 3478 + }, + { + "epoch": 2.633112582781457, + "grad_norm": 2.0736515522003174, + "learning_rate": 7.474622539433398e-07, + "loss": 1.0661, + "step": 3479 + }, + { + "epoch": 2.6338694418164614, + "grad_norm": 1.9620708227157593, + "learning_rate": 7.444236734105581e-07, + "loss": 1.0835, + "step": 3480 + }, + { + "epoch": 2.6346263008514663, + "grad_norm": 2.1406285762786865, + "learning_rate": 7.413910595257105e-07, + "loss": 1.0577, + "step": 3481 + }, + { + "epoch": 2.635383159886471, + "grad_norm": 1.9883054494857788, + "learning_rate": 7.383644142435741e-07, + "loss": 1.0546, + "step": 3482 + }, + { + "epoch": 2.636140018921476, + "grad_norm": 1.9472057819366455, + "learning_rate": 7.353437395150799e-07, + "loss": 1.0452, + "step": 3483 + }, + { + "epoch": 2.6368968779564805, + "grad_norm": 2.0464439392089844, + "learning_rate": 7.323290372873055e-07, + "loss": 1.1029, + "step": 3484 + }, + { + "epoch": 2.6376537369914854, + "grad_norm": 1.9990071058273315, + "learning_rate": 7.293203095034839e-07, + "loss": 1.1126, + "step": 3485 + }, + { + "epoch": 2.63841059602649, + "grad_norm": 2.022820234298706, + "learning_rate": 7.263175581029933e-07, + "loss": 1.0625, + "step": 3486 + }, + { + "epoch": 2.6391674550614947, + "grad_norm": 1.835789442062378, + "learning_rate": 7.233207850213639e-07, + "loss": 1.0732, + "step": 3487 + }, + { + "epoch": 2.6399243140964996, + "grad_norm": 1.9170242547988892, + "learning_rate": 7.20329992190268e-07, + "loss": 1.1162, + "step": 3488 + }, + { + "epoch": 2.6406811731315045, + "grad_norm": 1.9878673553466797, + "learning_rate": 7.173451815375276e-07, + "loss": 1.0664, + "step": 3489 + }, + { + "epoch": 2.641438032166509, + "grad_norm": 2.3943591117858887, + "learning_rate": 7.14366354987102e-07, + "loss": 1.1248, + "step": 3490 + }, + { + "epoch": 2.642194891201514, + "grad_norm": 1.926537275314331, + "learning_rate": 7.113935144591011e-07, + "loss": 1.0654, + "step": 3491 + }, + { + "epoch": 2.6429517502365183, + "grad_norm": 2.132347583770752, + "learning_rate": 7.084266618697722e-07, + "loss": 1.058, + "step": 3492 + }, + { + "epoch": 2.643708609271523, + "grad_norm": 2.0414459705352783, + "learning_rate": 7.054657991315009e-07, + "loss": 1.0699, + "step": 3493 + }, + { + "epoch": 2.644465468306528, + "grad_norm": 2.3885207176208496, + "learning_rate": 7.025109281528162e-07, + "loss": 1.0914, + "step": 3494 + }, + { + "epoch": 2.6452223273415325, + "grad_norm": 2.1995675563812256, + "learning_rate": 6.995620508383816e-07, + "loss": 1.0691, + "step": 3495 + }, + { + "epoch": 2.6459791863765374, + "grad_norm": 1.9995527267456055, + "learning_rate": 6.966191690889987e-07, + "loss": 1.0715, + "step": 3496 + }, + { + "epoch": 2.646736045411542, + "grad_norm": 2.06939435005188, + "learning_rate": 6.936822848016048e-07, + "loss": 1.0522, + "step": 3497 + }, + { + "epoch": 2.6474929044465467, + "grad_norm": 2.001063346862793, + "learning_rate": 6.907513998692701e-07, + "loss": 1.0726, + "step": 3498 + }, + { + "epoch": 2.6482497634815516, + "grad_norm": 2.1571309566497803, + "learning_rate": 6.878265161812005e-07, + "loss": 1.0609, + "step": 3499 + }, + { + "epoch": 2.6490066225165565, + "grad_norm": 2.0034592151641846, + "learning_rate": 6.849076356227285e-07, + "loss": 1.0636, + "step": 3500 + }, + { + "epoch": 2.649763481551561, + "grad_norm": 1.8944875001907349, + "learning_rate": 6.819947600753214e-07, + "loss": 1.061, + "step": 3501 + }, + { + "epoch": 2.650520340586566, + "grad_norm": 1.9522629976272583, + "learning_rate": 6.790878914165723e-07, + "loss": 1.1064, + "step": 3502 + }, + { + "epoch": 2.6512771996215703, + "grad_norm": 1.9700335264205933, + "learning_rate": 6.761870315202072e-07, + "loss": 1.0509, + "step": 3503 + }, + { + "epoch": 2.652034058656575, + "grad_norm": 1.9173399209976196, + "learning_rate": 6.732921822560753e-07, + "loss": 1.0467, + "step": 3504 + }, + { + "epoch": 2.65279091769158, + "grad_norm": 2.1325631141662598, + "learning_rate": 6.704033454901552e-07, + "loss": 1.0547, + "step": 3505 + }, + { + "epoch": 2.653547776726585, + "grad_norm": 1.9540364742279053, + "learning_rate": 6.67520523084541e-07, + "loss": 1.1084, + "step": 3506 + }, + { + "epoch": 2.6543046357615894, + "grad_norm": 2.026878595352173, + "learning_rate": 6.646437168974577e-07, + "loss": 1.0511, + "step": 3507 + }, + { + "epoch": 2.6550614947965943, + "grad_norm": 2.238311290740967, + "learning_rate": 6.617729287832535e-07, + "loss": 1.1053, + "step": 3508 + }, + { + "epoch": 2.6558183538315987, + "grad_norm": 2.0573630332946777, + "learning_rate": 6.589081605923916e-07, + "loss": 1.0377, + "step": 3509 + }, + { + "epoch": 2.6565752128666036, + "grad_norm": 2.0835254192352295, + "learning_rate": 6.56049414171461e-07, + "loss": 1.0403, + "step": 3510 + }, + { + "epoch": 2.6573320719016085, + "grad_norm": 1.888987421989441, + "learning_rate": 6.531966913631649e-07, + "loss": 1.0874, + "step": 3511 + }, + { + "epoch": 2.658088930936613, + "grad_norm": 1.9704266786575317, + "learning_rate": 6.503499940063245e-07, + "loss": 1.0968, + "step": 3512 + }, + { + "epoch": 2.658845789971618, + "grad_norm": 2.062167167663574, + "learning_rate": 6.475093239358764e-07, + "loss": 1.074, + "step": 3513 + }, + { + "epoch": 2.6596026490066222, + "grad_norm": 2.041229724884033, + "learning_rate": 6.446746829828747e-07, + "loss": 1.0881, + "step": 3514 + }, + { + "epoch": 2.660359508041627, + "grad_norm": 2.0432045459747314, + "learning_rate": 6.41846072974484e-07, + "loss": 1.0497, + "step": 3515 + }, + { + "epoch": 2.661116367076632, + "grad_norm": 2.037137746810913, + "learning_rate": 6.390234957339877e-07, + "loss": 1.1183, + "step": 3516 + }, + { + "epoch": 2.661873226111637, + "grad_norm": 2.0373597145080566, + "learning_rate": 6.362069530807692e-07, + "loss": 1.0721, + "step": 3517 + }, + { + "epoch": 2.6626300851466413, + "grad_norm": 2.2422561645507812, + "learning_rate": 6.333964468303339e-07, + "loss": 1.1072, + "step": 3518 + }, + { + "epoch": 2.6633869441816462, + "grad_norm": 2.035428285598755, + "learning_rate": 6.305919787942921e-07, + "loss": 1.0755, + "step": 3519 + }, + { + "epoch": 2.6641438032166507, + "grad_norm": 2.0684683322906494, + "learning_rate": 6.277935507803559e-07, + "loss": 1.0673, + "step": 3520 + }, + { + "epoch": 2.6649006622516556, + "grad_norm": 2.1325268745422363, + "learning_rate": 6.25001164592354e-07, + "loss": 1.0269, + "step": 3521 + }, + { + "epoch": 2.6656575212866604, + "grad_norm": 1.9109140634536743, + "learning_rate": 6.222148220302141e-07, + "loss": 1.0445, + "step": 3522 + }, + { + "epoch": 2.6664143803216653, + "grad_norm": 2.0545241832733154, + "learning_rate": 6.1943452488997e-07, + "loss": 1.03, + "step": 3523 + }, + { + "epoch": 2.6671712393566698, + "grad_norm": 1.9249234199523926, + "learning_rate": 6.166602749637587e-07, + "loss": 1.0247, + "step": 3524 + }, + { + "epoch": 2.6679280983916747, + "grad_norm": 2.097187042236328, + "learning_rate": 6.138920740398207e-07, + "loss": 1.0982, + "step": 3525 + }, + { + "epoch": 2.668684957426679, + "grad_norm": 2.3547725677490234, + "learning_rate": 6.111299239024957e-07, + "loss": 1.0561, + "step": 3526 + }, + { + "epoch": 2.669441816461684, + "grad_norm": 1.9423364400863647, + "learning_rate": 6.083738263322244e-07, + "loss": 1.0832, + "step": 3527 + }, + { + "epoch": 2.670198675496689, + "grad_norm": 1.9852162599563599, + "learning_rate": 6.056237831055416e-07, + "loss": 1.0746, + "step": 3528 + }, + { + "epoch": 2.6709555345316933, + "grad_norm": 2.095628261566162, + "learning_rate": 6.02879795995085e-07, + "loss": 1.0651, + "step": 3529 + }, + { + "epoch": 2.671712393566698, + "grad_norm": 2.109067916870117, + "learning_rate": 6.001418667695884e-07, + "loss": 1.0847, + "step": 3530 + }, + { + "epoch": 2.672469252601703, + "grad_norm": 1.9949407577514648, + "learning_rate": 5.97409997193879e-07, + "loss": 1.1257, + "step": 3531 + }, + { + "epoch": 2.6732261116367075, + "grad_norm": 2.216343641281128, + "learning_rate": 5.946841890288763e-07, + "loss": 1.0922, + "step": 3532 + }, + { + "epoch": 2.6739829706717124, + "grad_norm": 1.9918749332427979, + "learning_rate": 5.91964444031599e-07, + "loss": 1.0832, + "step": 3533 + }, + { + "epoch": 2.6747398297067173, + "grad_norm": 2.0035977363586426, + "learning_rate": 5.892507639551483e-07, + "loss": 1.1023, + "step": 3534 + }, + { + "epoch": 2.6754966887417218, + "grad_norm": 1.9594407081604004, + "learning_rate": 5.86543150548722e-07, + "loss": 1.0479, + "step": 3535 + }, + { + "epoch": 2.6762535477767266, + "grad_norm": 1.9996135234832764, + "learning_rate": 5.838416055576072e-07, + "loss": 1.0837, + "step": 3536 + }, + { + "epoch": 2.677010406811731, + "grad_norm": 2.032686710357666, + "learning_rate": 5.811461307231798e-07, + "loss": 1.0673, + "step": 3537 + }, + { + "epoch": 2.677767265846736, + "grad_norm": 1.7957963943481445, + "learning_rate": 5.784567277829007e-07, + "loss": 1.0672, + "step": 3538 + }, + { + "epoch": 2.678524124881741, + "grad_norm": 1.936874508857727, + "learning_rate": 5.757733984703174e-07, + "loss": 1.1329, + "step": 3539 + }, + { + "epoch": 2.6792809839167457, + "grad_norm": 2.072567939758301, + "learning_rate": 5.730961445150644e-07, + "loss": 1.1066, + "step": 3540 + }, + { + "epoch": 2.68003784295175, + "grad_norm": 1.8656892776489258, + "learning_rate": 5.704249676428575e-07, + "loss": 1.1158, + "step": 3541 + }, + { + "epoch": 2.680794701986755, + "grad_norm": 1.9235533475875854, + "learning_rate": 5.677598695754967e-07, + "loss": 1.0364, + "step": 3542 + }, + { + "epoch": 2.6815515610217595, + "grad_norm": 2.159919261932373, + "learning_rate": 5.651008520308641e-07, + "loss": 1.0813, + "step": 3543 + }, + { + "epoch": 2.6823084200567644, + "grad_norm": 2.093416929244995, + "learning_rate": 5.624479167229225e-07, + "loss": 1.082, + "step": 3544 + }, + { + "epoch": 2.6830652790917693, + "grad_norm": 1.9927400350570679, + "learning_rate": 5.598010653617116e-07, + "loss": 1.0486, + "step": 3545 + }, + { + "epoch": 2.6838221381267737, + "grad_norm": 1.986259937286377, + "learning_rate": 5.571602996533528e-07, + "loss": 1.0532, + "step": 3546 + }, + { + "epoch": 2.6845789971617786, + "grad_norm": 2.1325762271881104, + "learning_rate": 5.54525621300045e-07, + "loss": 1.1321, + "step": 3547 + }, + { + "epoch": 2.6853358561967835, + "grad_norm": 1.9752742052078247, + "learning_rate": 5.518970320000578e-07, + "loss": 1.0752, + "step": 3548 + }, + { + "epoch": 2.686092715231788, + "grad_norm": 1.9965808391571045, + "learning_rate": 5.492745334477438e-07, + "loss": 1.0721, + "step": 3549 + }, + { + "epoch": 2.686849574266793, + "grad_norm": 2.021066427230835, + "learning_rate": 5.466581273335216e-07, + "loss": 1.0819, + "step": 3550 + }, + { + "epoch": 2.6876064333017977, + "grad_norm": 2.0489556789398193, + "learning_rate": 5.440478153438891e-07, + "loss": 1.0542, + "step": 3551 + }, + { + "epoch": 2.688363292336802, + "grad_norm": 2.1207127571105957, + "learning_rate": 5.414435991614129e-07, + "loss": 1.0577, + "step": 3552 + }, + { + "epoch": 2.689120151371807, + "grad_norm": 2.099400281906128, + "learning_rate": 5.388454804647312e-07, + "loss": 1.0527, + "step": 3553 + }, + { + "epoch": 2.6898770104068115, + "grad_norm": 2.165239095687866, + "learning_rate": 5.362534609285534e-07, + "loss": 1.0728, + "step": 3554 + }, + { + "epoch": 2.6906338694418164, + "grad_norm": 1.964612364768982, + "learning_rate": 5.336675422236547e-07, + "loss": 1.1085, + "step": 3555 + }, + { + "epoch": 2.6913907284768213, + "grad_norm": 2.3169875144958496, + "learning_rate": 5.31087726016876e-07, + "loss": 1.0873, + "step": 3556 + }, + { + "epoch": 2.692147587511826, + "grad_norm": 2.005558490753174, + "learning_rate": 5.285140139711306e-07, + "loss": 1.0435, + "step": 3557 + }, + { + "epoch": 2.6929044465468306, + "grad_norm": 1.9185731410980225, + "learning_rate": 5.259464077453933e-07, + "loss": 1.1144, + "step": 3558 + }, + { + "epoch": 2.6936613055818355, + "grad_norm": 1.933445930480957, + "learning_rate": 5.233849089947034e-07, + "loss": 1.0526, + "step": 3559 + }, + { + "epoch": 2.69441816461684, + "grad_norm": 2.1504805088043213, + "learning_rate": 5.208295193701673e-07, + "loss": 1.0822, + "step": 3560 + }, + { + "epoch": 2.695175023651845, + "grad_norm": 2.1270816326141357, + "learning_rate": 5.182802405189443e-07, + "loss": 1.0848, + "step": 3561 + }, + { + "epoch": 2.6959318826868497, + "grad_norm": 2.112243890762329, + "learning_rate": 5.157370740842649e-07, + "loss": 1.0501, + "step": 3562 + }, + { + "epoch": 2.6966887417218546, + "grad_norm": 2.2307591438293457, + "learning_rate": 5.132000217054134e-07, + "loss": 1.1388, + "step": 3563 + }, + { + "epoch": 2.697445600756859, + "grad_norm": 2.053459405899048, + "learning_rate": 5.106690850177358e-07, + "loss": 1.0846, + "step": 3564 + }, + { + "epoch": 2.698202459791864, + "grad_norm": 2.0699667930603027, + "learning_rate": 5.08144265652635e-07, + "loss": 1.0567, + "step": 3565 + }, + { + "epoch": 2.6989593188268683, + "grad_norm": 2.0828826427459717, + "learning_rate": 5.056255652375729e-07, + "loss": 1.0729, + "step": 3566 + }, + { + "epoch": 2.6997161778618732, + "grad_norm": 1.9452773332595825, + "learning_rate": 5.031129853960639e-07, + "loss": 1.0788, + "step": 3567 + }, + { + "epoch": 2.700473036896878, + "grad_norm": 2.0344362258911133, + "learning_rate": 5.006065277476771e-07, + "loss": 1.054, + "step": 3568 + }, + { + "epoch": 2.7012298959318826, + "grad_norm": 1.900039792060852, + "learning_rate": 4.981061939080384e-07, + "loss": 1.0262, + "step": 3569 + }, + { + "epoch": 2.7019867549668874, + "grad_norm": 1.9985625743865967, + "learning_rate": 4.956119854888261e-07, + "loss": 1.0899, + "step": 3570 + }, + { + "epoch": 2.702743614001892, + "grad_norm": 2.088229179382324, + "learning_rate": 4.931239040977678e-07, + "loss": 1.1423, + "step": 3571 + }, + { + "epoch": 2.703500473036897, + "grad_norm": 1.9471749067306519, + "learning_rate": 4.90641951338641e-07, + "loss": 1.0762, + "step": 3572 + }, + { + "epoch": 2.7042573320719017, + "grad_norm": 1.8323947191238403, + "learning_rate": 4.88166128811277e-07, + "loss": 1.0531, + "step": 3573 + }, + { + "epoch": 2.7050141911069066, + "grad_norm": 1.9754245281219482, + "learning_rate": 4.856964381115542e-07, + "loss": 1.1185, + "step": 3574 + }, + { + "epoch": 2.705771050141911, + "grad_norm": 1.9460619688034058, + "learning_rate": 4.83232880831394e-07, + "loss": 1.1218, + "step": 3575 + }, + { + "epoch": 2.706527909176916, + "grad_norm": 2.2051377296447754, + "learning_rate": 4.807754585587696e-07, + "loss": 1.0773, + "step": 3576 + }, + { + "epoch": 2.7072847682119203, + "grad_norm": 1.9983853101730347, + "learning_rate": 4.783241728776997e-07, + "loss": 1.0325, + "step": 3577 + }, + { + "epoch": 2.708041627246925, + "grad_norm": 1.9599753618240356, + "learning_rate": 4.7587902536824234e-07, + "loss": 1.0701, + "step": 3578 + }, + { + "epoch": 2.70879848628193, + "grad_norm": 2.0052897930145264, + "learning_rate": 4.7344001760650454e-07, + "loss": 1.0672, + "step": 3579 + }, + { + "epoch": 2.709555345316935, + "grad_norm": 3.11828351020813, + "learning_rate": 4.710071511646324e-07, + "loss": 1.0932, + "step": 3580 + }, + { + "epoch": 2.7103122043519394, + "grad_norm": 2.1355981826782227, + "learning_rate": 4.685804276108169e-07, + "loss": 1.1196, + "step": 3581 + }, + { + "epoch": 2.7110690633869443, + "grad_norm": 2.2099850177764893, + "learning_rate": 4.6615984850928456e-07, + "loss": 1.1028, + "step": 3582 + }, + { + "epoch": 2.7118259224219488, + "grad_norm": 1.9474663734436035, + "learning_rate": 4.637454154203033e-07, + "loss": 1.07, + "step": 3583 + }, + { + "epoch": 2.7125827814569536, + "grad_norm": 2.1069188117980957, + "learning_rate": 4.613371299001815e-07, + "loss": 1.0899, + "step": 3584 + }, + { + "epoch": 2.7133396404919585, + "grad_norm": 2.008517265319824, + "learning_rate": 4.58934993501263e-07, + "loss": 1.0885, + "step": 3585 + }, + { + "epoch": 2.714096499526963, + "grad_norm": 1.88406503200531, + "learning_rate": 4.5653900777192763e-07, + "loss": 1.0659, + "step": 3586 + }, + { + "epoch": 2.714853358561968, + "grad_norm": 2.1920740604400635, + "learning_rate": 4.5414917425659094e-07, + "loss": 1.1038, + "step": 3587 + }, + { + "epoch": 2.7156102175969723, + "grad_norm": 2.047375440597534, + "learning_rate": 4.5176549449570765e-07, + "loss": 1.0542, + "step": 3588 + }, + { + "epoch": 2.716367076631977, + "grad_norm": 1.9768850803375244, + "learning_rate": 4.4938797002575485e-07, + "loss": 1.0511, + "step": 3589 + }, + { + "epoch": 2.717123935666982, + "grad_norm": 2.005725145339966, + "learning_rate": 4.4701660237925116e-07, + "loss": 1.0859, + "step": 3590 + }, + { + "epoch": 2.717880794701987, + "grad_norm": 2.0299482345581055, + "learning_rate": 4.446513930847431e-07, + "loss": 1.052, + "step": 3591 + }, + { + "epoch": 2.7186376537369914, + "grad_norm": 2.217197895050049, + "learning_rate": 4.4229234366681054e-07, + "loss": 1.083, + "step": 3592 + }, + { + "epoch": 2.7193945127719963, + "grad_norm": 2.225231170654297, + "learning_rate": 4.399394556460618e-07, + "loss": 1.1048, + "step": 3593 + }, + { + "epoch": 2.7201513718070007, + "grad_norm": 2.035879135131836, + "learning_rate": 4.375927305391286e-07, + "loss": 1.0064, + "step": 3594 + }, + { + "epoch": 2.7209082308420056, + "grad_norm": 2.046074628829956, + "learning_rate": 4.352521698586783e-07, + "loss": 1.0649, + "step": 3595 + }, + { + "epoch": 2.7216650898770105, + "grad_norm": 2.068490743637085, + "learning_rate": 4.329177751133964e-07, + "loss": 1.0509, + "step": 3596 + }, + { + "epoch": 2.7224219489120154, + "grad_norm": 2.191215991973877, + "learning_rate": 4.305895478079998e-07, + "loss": 1.0413, + "step": 3597 + }, + { + "epoch": 2.72317880794702, + "grad_norm": 2.310241937637329, + "learning_rate": 4.2826748944323e-07, + "loss": 1.0864, + "step": 3598 + }, + { + "epoch": 2.7239356669820247, + "grad_norm": 2.196274757385254, + "learning_rate": 4.2595160151584996e-07, + "loss": 1.0302, + "step": 3599 + }, + { + "epoch": 2.724692526017029, + "grad_norm": 2.0941972732543945, + "learning_rate": 4.2364188551864284e-07, + "loss": 1.0968, + "step": 3600 + }, + { + "epoch": 2.725449385052034, + "grad_norm": 2.1524224281311035, + "learning_rate": 4.213383429404197e-07, + "loss": 1.0739, + "step": 3601 + }, + { + "epoch": 2.726206244087039, + "grad_norm": 2.400557518005371, + "learning_rate": 4.190409752660077e-07, + "loss": 1.1176, + "step": 3602 + }, + { + "epoch": 2.7269631031220434, + "grad_norm": 2.0198590755462646, + "learning_rate": 4.16749783976255e-07, + "loss": 1.0351, + "step": 3603 + }, + { + "epoch": 2.7277199621570483, + "grad_norm": 2.151195526123047, + "learning_rate": 4.144647705480291e-07, + "loss": 1.0867, + "step": 3604 + }, + { + "epoch": 2.7284768211920527, + "grad_norm": 1.927239179611206, + "learning_rate": 4.1218593645421344e-07, + "loss": 1.0605, + "step": 3605 + }, + { + "epoch": 2.7292336802270576, + "grad_norm": 2.175260066986084, + "learning_rate": 4.099132831637103e-07, + "loss": 1.0312, + "step": 3606 + }, + { + "epoch": 2.7299905392620625, + "grad_norm": 2.2161762714385986, + "learning_rate": 4.0764681214143794e-07, + "loss": 1.0217, + "step": 3607 + }, + { + "epoch": 2.7307473982970674, + "grad_norm": 2.06466007232666, + "learning_rate": 4.053865248483281e-07, + "loss": 1.0851, + "step": 3608 + }, + { + "epoch": 2.731504257332072, + "grad_norm": 2.1965982913970947, + "learning_rate": 4.031324227413297e-07, + "loss": 1.0758, + "step": 3609 + }, + { + "epoch": 2.7322611163670767, + "grad_norm": 2.31892728805542, + "learning_rate": 4.008845072734016e-07, + "loss": 1.1159, + "step": 3610 + }, + { + "epoch": 2.733017975402081, + "grad_norm": 2.0228688716888428, + "learning_rate": 3.986427798935131e-07, + "loss": 1.0769, + "step": 3611 + }, + { + "epoch": 2.733774834437086, + "grad_norm": 2.0157992839813232, + "learning_rate": 3.964072420466503e-07, + "loss": 1.0597, + "step": 3612 + }, + { + "epoch": 2.734531693472091, + "grad_norm": 1.9818907976150513, + "learning_rate": 3.9417789517380527e-07, + "loss": 1.0732, + "step": 3613 + }, + { + "epoch": 2.735288552507096, + "grad_norm": 2.1533520221710205, + "learning_rate": 3.919547407119824e-07, + "loss": 1.063, + "step": 3614 + }, + { + "epoch": 2.7360454115421002, + "grad_norm": 2.073683738708496, + "learning_rate": 3.897377800941943e-07, + "loss": 1.0551, + "step": 3615 + }, + { + "epoch": 2.736802270577105, + "grad_norm": 2.021272897720337, + "learning_rate": 3.875270147494558e-07, + "loss": 1.1027, + "step": 3616 + }, + { + "epoch": 2.7375591296121096, + "grad_norm": 2.308957099914551, + "learning_rate": 3.853224461027956e-07, + "loss": 1.076, + "step": 3617 + }, + { + "epoch": 2.7383159886471145, + "grad_norm": 2.239806652069092, + "learning_rate": 3.8312407557524466e-07, + "loss": 1.0998, + "step": 3618 + }, + { + "epoch": 2.7390728476821193, + "grad_norm": 2.1331143379211426, + "learning_rate": 3.8093190458383777e-07, + "loss": 1.1259, + "step": 3619 + }, + { + "epoch": 2.739829706717124, + "grad_norm": 2.0615665912628174, + "learning_rate": 3.7874593454161647e-07, + "loss": 1.0191, + "step": 3620 + }, + { + "epoch": 2.7405865657521287, + "grad_norm": 1.9834305047988892, + "learning_rate": 3.7656616685762473e-07, + "loss": 1.0553, + "step": 3621 + }, + { + "epoch": 2.7413434247871336, + "grad_norm": 2.1964480876922607, + "learning_rate": 3.7439260293690597e-07, + "loss": 1.0388, + "step": 3622 + }, + { + "epoch": 2.742100283822138, + "grad_norm": 1.9631261825561523, + "learning_rate": 3.722252441805057e-07, + "loss": 1.0515, + "step": 3623 + }, + { + "epoch": 2.742857142857143, + "grad_norm": 2.0425281524658203, + "learning_rate": 3.7006409198547004e-07, + "loss": 1.0117, + "step": 3624 + }, + { + "epoch": 2.7436140018921478, + "grad_norm": 2.2030279636383057, + "learning_rate": 3.6790914774484625e-07, + "loss": 1.0627, + "step": 3625 + }, + { + "epoch": 2.744370860927152, + "grad_norm": 2.0677294731140137, + "learning_rate": 3.6576041284767873e-07, + "loss": 1.0504, + "step": 3626 + }, + { + "epoch": 2.745127719962157, + "grad_norm": 1.951145887374878, + "learning_rate": 3.6361788867900865e-07, + "loss": 1.0804, + "step": 3627 + }, + { + "epoch": 2.7458845789971615, + "grad_norm": 2.083099126815796, + "learning_rate": 3.614815766198731e-07, + "loss": 1.0902, + "step": 3628 + }, + { + "epoch": 2.7466414380321664, + "grad_norm": 2.0568675994873047, + "learning_rate": 3.593514780473093e-07, + "loss": 1.0659, + "step": 3629 + }, + { + "epoch": 2.7473982970671713, + "grad_norm": 2.0525779724121094, + "learning_rate": 3.572275943343428e-07, + "loss": 1.0885, + "step": 3630 + }, + { + "epoch": 2.748155156102176, + "grad_norm": 2.105832576751709, + "learning_rate": 3.55109926849998e-07, + "loss": 1.0754, + "step": 3631 + }, + { + "epoch": 2.7489120151371806, + "grad_norm": 2.1376736164093018, + "learning_rate": 3.5299847695929306e-07, + "loss": 1.1257, + "step": 3632 + }, + { + "epoch": 2.7496688741721855, + "grad_norm": 1.8944578170776367, + "learning_rate": 3.508932460232331e-07, + "loss": 1.0465, + "step": 3633 + }, + { + "epoch": 2.75042573320719, + "grad_norm": 2.0414884090423584, + "learning_rate": 3.4879423539882017e-07, + "loss": 1.0621, + "step": 3634 + }, + { + "epoch": 2.751182592242195, + "grad_norm": 2.1808700561523438, + "learning_rate": 3.467014464390431e-07, + "loss": 1.0513, + "step": 3635 + }, + { + "epoch": 2.7519394512771997, + "grad_norm": 2.060415506362915, + "learning_rate": 3.446148804928836e-07, + "loss": 1.0927, + "step": 3636 + }, + { + "epoch": 2.752696310312204, + "grad_norm": 1.9510079622268677, + "learning_rate": 3.425345389053098e-07, + "loss": 1.0684, + "step": 3637 + }, + { + "epoch": 2.753453169347209, + "grad_norm": 1.9349720478057861, + "learning_rate": 3.4046042301727504e-07, + "loss": 1.0437, + "step": 3638 + }, + { + "epoch": 2.754210028382214, + "grad_norm": 1.9200588464736938, + "learning_rate": 3.383925341657259e-07, + "loss": 1.0417, + "step": 3639 + }, + { + "epoch": 2.7549668874172184, + "grad_norm": 1.9135462045669556, + "learning_rate": 3.363308736835918e-07, + "loss": 1.0593, + "step": 3640 + }, + { + "epoch": 2.7557237464522233, + "grad_norm": 2.030207633972168, + "learning_rate": 3.342754428997865e-07, + "loss": 1.0311, + "step": 3641 + }, + { + "epoch": 2.756480605487228, + "grad_norm": 2.0563161373138428, + "learning_rate": 3.3222624313920995e-07, + "loss": 1.1101, + "step": 3642 + }, + { + "epoch": 2.7572374645222326, + "grad_norm": 2.1681125164031982, + "learning_rate": 3.301832757227478e-07, + "loss": 1.1007, + "step": 3643 + }, + { + "epoch": 2.7579943235572375, + "grad_norm": 1.9184566736221313, + "learning_rate": 3.281465419672603e-07, + "loss": 1.0738, + "step": 3644 + }, + { + "epoch": 2.758751182592242, + "grad_norm": 2.2246665954589844, + "learning_rate": 3.261160431855982e-07, + "loss": 1.0967, + "step": 3645 + }, + { + "epoch": 2.759508041627247, + "grad_norm": 2.213003396987915, + "learning_rate": 3.240917806865891e-07, + "loss": 1.1179, + "step": 3646 + }, + { + "epoch": 2.7602649006622517, + "grad_norm": 2.001859426498413, + "learning_rate": 3.2207375577504196e-07, + "loss": 1.0601, + "step": 3647 + }, + { + "epoch": 2.7610217596972566, + "grad_norm": 1.9976732730865479, + "learning_rate": 3.2006196975174716e-07, + "loss": 1.0809, + "step": 3648 + }, + { + "epoch": 2.761778618732261, + "grad_norm": 2.0903263092041016, + "learning_rate": 3.1805642391346757e-07, + "loss": 1.1196, + "step": 3649 + }, + { + "epoch": 2.762535477767266, + "grad_norm": 2.2454066276550293, + "learning_rate": 3.160571195529498e-07, + "loss": 1.1021, + "step": 3650 + }, + { + "epoch": 2.7632923368022704, + "grad_norm": 2.1280694007873535, + "learning_rate": 3.1406405795891286e-07, + "loss": 1.1027, + "step": 3651 + }, + { + "epoch": 2.7640491958372753, + "grad_norm": 2.1310126781463623, + "learning_rate": 3.1207724041605493e-07, + "loss": 0.9978, + "step": 3652 + }, + { + "epoch": 2.76480605487228, + "grad_norm": 2.2121293544769287, + "learning_rate": 3.1009666820505004e-07, + "loss": 1.0708, + "step": 3653 + }, + { + "epoch": 2.765562913907285, + "grad_norm": 2.256673812866211, + "learning_rate": 3.081223426025437e-07, + "loss": 1.094, + "step": 3654 + }, + { + "epoch": 2.7663197729422895, + "grad_norm": 2.2821056842803955, + "learning_rate": 3.0615426488115385e-07, + "loss": 1.0542, + "step": 3655 + }, + { + "epoch": 2.7670766319772944, + "grad_norm": 2.1040828227996826, + "learning_rate": 3.0419243630947764e-07, + "loss": 1.0439, + "step": 3656 + }, + { + "epoch": 2.767833491012299, + "grad_norm": 2.050218343734741, + "learning_rate": 3.022368581520758e-07, + "loss": 1.0747, + "step": 3657 + }, + { + "epoch": 2.7685903500473037, + "grad_norm": 1.962795376777649, + "learning_rate": 3.0028753166948504e-07, + "loss": 1.1227, + "step": 3658 + }, + { + "epoch": 2.7693472090823086, + "grad_norm": 2.256727933883667, + "learning_rate": 2.983444581182144e-07, + "loss": 1.123, + "step": 3659 + }, + { + "epoch": 2.770104068117313, + "grad_norm": 2.0236082077026367, + "learning_rate": 2.964076387507367e-07, + "loss": 1.0793, + "step": 3660 + }, + { + "epoch": 2.770860927152318, + "grad_norm": 2.2242016792297363, + "learning_rate": 2.944770748154961e-07, + "loss": 1.0849, + "step": 3661 + }, + { + "epoch": 2.7716177861873224, + "grad_norm": 1.8702110052108765, + "learning_rate": 2.9255276755690594e-07, + "loss": 1.0598, + "step": 3662 + }, + { + "epoch": 2.7723746452223272, + "grad_norm": 2.0809333324432373, + "learning_rate": 2.9063471821534544e-07, + "loss": 1.1008, + "step": 3663 + }, + { + "epoch": 2.773131504257332, + "grad_norm": 2.0802369117736816, + "learning_rate": 2.8872292802715856e-07, + "loss": 1.0757, + "step": 3664 + }, + { + "epoch": 2.773888363292337, + "grad_norm": 2.361516237258911, + "learning_rate": 2.868173982246573e-07, + "loss": 1.1079, + "step": 3665 + }, + { + "epoch": 2.7746452223273415, + "grad_norm": 2.069173574447632, + "learning_rate": 2.8491813003611614e-07, + "loss": 1.0559, + "step": 3666 + }, + { + "epoch": 2.7754020813623463, + "grad_norm": 1.9263156652450562, + "learning_rate": 2.830251246857745e-07, + "loss": 1.054, + "step": 3667 + }, + { + "epoch": 2.776158940397351, + "grad_norm": 1.9880831241607666, + "learning_rate": 2.811383833938352e-07, + "loss": 1.0915, + "step": 3668 + }, + { + "epoch": 2.7769157994323557, + "grad_norm": 1.977330207824707, + "learning_rate": 2.7925790737646375e-07, + "loss": 1.0582, + "step": 3669 + }, + { + "epoch": 2.7776726584673606, + "grad_norm": 2.2954440116882324, + "learning_rate": 2.7738369784578694e-07, + "loss": 1.093, + "step": 3670 + }, + { + "epoch": 2.7784295175023654, + "grad_norm": 1.9425572156906128, + "learning_rate": 2.755157560098875e-07, + "loss": 1.0558, + "step": 3671 + }, + { + "epoch": 2.77918637653737, + "grad_norm": 1.9914302825927734, + "learning_rate": 2.736540830728152e-07, + "loss": 1.1168, + "step": 3672 + }, + { + "epoch": 2.7799432355723748, + "grad_norm": 2.00032114982605, + "learning_rate": 2.717986802345765e-07, + "loss": 1.0692, + "step": 3673 + }, + { + "epoch": 2.780700094607379, + "grad_norm": 2.004713535308838, + "learning_rate": 2.6994954869113416e-07, + "loss": 1.0587, + "step": 3674 + }, + { + "epoch": 2.781456953642384, + "grad_norm": 2.0385048389434814, + "learning_rate": 2.6810668963441194e-07, + "loss": 1.064, + "step": 3675 + }, + { + "epoch": 2.782213812677389, + "grad_norm": 1.966386079788208, + "learning_rate": 2.662701042522874e-07, + "loss": 1.0086, + "step": 3676 + }, + { + "epoch": 2.7829706717123934, + "grad_norm": 2.2672669887542725, + "learning_rate": 2.644397937285963e-07, + "loss": 1.0867, + "step": 3677 + }, + { + "epoch": 2.7837275307473983, + "grad_norm": 1.9181667566299438, + "learning_rate": 2.626157592431278e-07, + "loss": 1.0969, + "step": 3678 + }, + { + "epoch": 2.7844843897824028, + "grad_norm": 2.0945870876312256, + "learning_rate": 2.607980019716272e-07, + "loss": 1.0493, + "step": 3679 + }, + { + "epoch": 2.7852412488174076, + "grad_norm": 1.9859826564788818, + "learning_rate": 2.589865230857932e-07, + "loss": 1.0695, + "step": 3680 + }, + { + "epoch": 2.7859981078524125, + "grad_norm": 1.9504257440567017, + "learning_rate": 2.5718132375327933e-07, + "loss": 1.0653, + "step": 3681 + }, + { + "epoch": 2.7867549668874174, + "grad_norm": 1.9905445575714111, + "learning_rate": 2.5538240513768625e-07, + "loss": 1.0907, + "step": 3682 + }, + { + "epoch": 2.787511825922422, + "grad_norm": 2.076355457305908, + "learning_rate": 2.535897683985702e-07, + "loss": 1.0448, + "step": 3683 + }, + { + "epoch": 2.7882686849574267, + "grad_norm": 1.986864447593689, + "learning_rate": 2.518034146914401e-07, + "loss": 1.0296, + "step": 3684 + }, + { + "epoch": 2.789025543992431, + "grad_norm": 1.9109041690826416, + "learning_rate": 2.5002334516774865e-07, + "loss": 1.0455, + "step": 3685 + }, + { + "epoch": 2.789782403027436, + "grad_norm": 2.183528423309326, + "learning_rate": 2.482495609749042e-07, + "loss": 1.0824, + "step": 3686 + }, + { + "epoch": 2.790539262062441, + "grad_norm": 2.090740203857422, + "learning_rate": 2.4648206325626e-07, + "loss": 1.0755, + "step": 3687 + }, + { + "epoch": 2.791296121097446, + "grad_norm": 2.1614151000976562, + "learning_rate": 2.447208531511184e-07, + "loss": 1.0617, + "step": 3688 + }, + { + "epoch": 2.7920529801324503, + "grad_norm": 1.9354277849197388, + "learning_rate": 2.429659317947277e-07, + "loss": 1.0702, + "step": 3689 + }, + { + "epoch": 2.792809839167455, + "grad_norm": 2.077448606491089, + "learning_rate": 2.412173003182842e-07, + "loss": 1.0656, + "step": 3690 + }, + { + "epoch": 2.7935666982024596, + "grad_norm": 1.9370477199554443, + "learning_rate": 2.394749598489302e-07, + "loss": 1.1324, + "step": 3691 + }, + { + "epoch": 2.7943235572374645, + "grad_norm": 2.0902650356292725, + "learning_rate": 2.3773891150975041e-07, + "loss": 1.1173, + "step": 3692 + }, + { + "epoch": 2.7950804162724694, + "grad_norm": 2.1968994140625, + "learning_rate": 2.3600915641977443e-07, + "loss": 1.1001, + "step": 3693 + }, + { + "epoch": 2.795837275307474, + "grad_norm": 1.9441262483596802, + "learning_rate": 2.342856956939765e-07, + "loss": 1.0932, + "step": 3694 + }, + { + "epoch": 2.7965941343424787, + "grad_norm": 2.0278730392456055, + "learning_rate": 2.3256853044327348e-07, + "loss": 1.1073, + "step": 3695 + }, + { + "epoch": 2.7973509933774836, + "grad_norm": 1.9890429973602295, + "learning_rate": 2.308576617745247e-07, + "loss": 1.0403, + "step": 3696 + }, + { + "epoch": 2.798107852412488, + "grad_norm": 1.9365586042404175, + "learning_rate": 2.2915309079052886e-07, + "loss": 1.0604, + "step": 3697 + }, + { + "epoch": 2.798864711447493, + "grad_norm": 2.051670789718628, + "learning_rate": 2.2745481859002917e-07, + "loss": 1.0739, + "step": 3698 + }, + { + "epoch": 2.799621570482498, + "grad_norm": 1.945073127746582, + "learning_rate": 2.2576284626770157e-07, + "loss": 1.0621, + "step": 3699 + }, + { + "epoch": 2.8003784295175023, + "grad_norm": 2.0430312156677246, + "learning_rate": 2.2407717491416676e-07, + "loss": 1.058, + "step": 3700 + }, + { + "epoch": 2.801135288552507, + "grad_norm": 2.074920177459717, + "learning_rate": 2.2239780561598455e-07, + "loss": 1.0765, + "step": 3701 + }, + { + "epoch": 2.8018921475875116, + "grad_norm": 2.323629140853882, + "learning_rate": 2.2072473945564961e-07, + "loss": 1.1025, + "step": 3702 + }, + { + "epoch": 2.8026490066225165, + "grad_norm": 1.8663524389266968, + "learning_rate": 2.1905797751159689e-07, + "loss": 1.0688, + "step": 3703 + }, + { + "epoch": 2.8034058656575214, + "grad_norm": 2.141047239303589, + "learning_rate": 2.1739752085819388e-07, + "loss": 1.0787, + "step": 3704 + }, + { + "epoch": 2.8041627246925263, + "grad_norm": 2.179725408554077, + "learning_rate": 2.15743370565744e-07, + "loss": 1.0848, + "step": 3705 + }, + { + "epoch": 2.8049195837275307, + "grad_norm": 2.0025246143341064, + "learning_rate": 2.1409552770048975e-07, + "loss": 1.0256, + "step": 3706 + }, + { + "epoch": 2.8056764427625356, + "grad_norm": 2.1321537494659424, + "learning_rate": 2.124539933246042e-07, + "loss": 1.1045, + "step": 3707 + }, + { + "epoch": 2.80643330179754, + "grad_norm": 2.0465590953826904, + "learning_rate": 2.108187684961972e-07, + "loss": 1.1277, + "step": 3708 + }, + { + "epoch": 2.807190160832545, + "grad_norm": 2.1039795875549316, + "learning_rate": 2.091898542693078e-07, + "loss": 1.0712, + "step": 3709 + }, + { + "epoch": 2.80794701986755, + "grad_norm": 2.3281686305999756, + "learning_rate": 2.0756725169391007e-07, + "loss": 1.1043, + "step": 3710 + }, + { + "epoch": 2.8087038789025542, + "grad_norm": 2.027113914489746, + "learning_rate": 2.0595096181591037e-07, + "loss": 1.1017, + "step": 3711 + }, + { + "epoch": 2.809460737937559, + "grad_norm": 2.1701509952545166, + "learning_rate": 2.04340985677141e-07, + "loss": 1.0812, + "step": 3712 + }, + { + "epoch": 2.810217596972564, + "grad_norm": 2.0328516960144043, + "learning_rate": 2.0273732431537025e-07, + "loss": 1.0951, + "step": 3713 + }, + { + "epoch": 2.8109744560075685, + "grad_norm": 2.0888283252716064, + "learning_rate": 2.0113997876429446e-07, + "loss": 1.1183, + "step": 3714 + }, + { + "epoch": 2.8117313150425733, + "grad_norm": 1.8546501398086548, + "learning_rate": 1.9954895005353692e-07, + "loss": 1.0742, + "step": 3715 + }, + { + "epoch": 2.8124881740775782, + "grad_norm": 1.985583782196045, + "learning_rate": 1.9796423920865021e-07, + "loss": 1.1027, + "step": 3716 + }, + { + "epoch": 2.8132450331125827, + "grad_norm": 2.0510141849517822, + "learning_rate": 1.9638584725111498e-07, + "loss": 1.0609, + "step": 3717 + }, + { + "epoch": 2.8140018921475876, + "grad_norm": 2.359945058822632, + "learning_rate": 1.9481377519834112e-07, + "loss": 1.0421, + "step": 3718 + }, + { + "epoch": 2.814758751182592, + "grad_norm": 2.107235908508301, + "learning_rate": 1.9324802406365883e-07, + "loss": 1.0114, + "step": 3719 + }, + { + "epoch": 2.815515610217597, + "grad_norm": 1.9575122594833374, + "learning_rate": 1.9168859485632866e-07, + "loss": 1.0763, + "step": 3720 + }, + { + "epoch": 2.8162724692526018, + "grad_norm": 2.15492582321167, + "learning_rate": 1.901354885815348e-07, + "loss": 1.0527, + "step": 3721 + }, + { + "epoch": 2.8170293282876067, + "grad_norm": 2.049591302871704, + "learning_rate": 1.8858870624038632e-07, + "loss": 1.0314, + "step": 3722 + }, + { + "epoch": 2.817786187322611, + "grad_norm": 2.623854875564575, + "learning_rate": 1.8704824882991584e-07, + "loss": 1.0589, + "step": 3723 + }, + { + "epoch": 2.818543046357616, + "grad_norm": 1.8997153043746948, + "learning_rate": 1.8551411734307744e-07, + "loss": 1.0561, + "step": 3724 + }, + { + "epoch": 2.8192999053926204, + "grad_norm": 2.2084269523620605, + "learning_rate": 1.8398631276875118e-07, + "loss": 1.0703, + "step": 3725 + }, + { + "epoch": 2.8200567644276253, + "grad_norm": 1.8947069644927979, + "learning_rate": 1.82464836091734e-07, + "loss": 1.0454, + "step": 3726 + }, + { + "epoch": 2.82081362346263, + "grad_norm": 1.8694313764572144, + "learning_rate": 1.8094968829274663e-07, + "loss": 1.0885, + "step": 3727 + }, + { + "epoch": 2.821570482497635, + "grad_norm": 2.0910801887512207, + "learning_rate": 1.7944087034843233e-07, + "loss": 1.0546, + "step": 3728 + }, + { + "epoch": 2.8223273415326395, + "grad_norm": 1.8177095651626587, + "learning_rate": 1.7793838323135016e-07, + "loss": 1.0405, + "step": 3729 + }, + { + "epoch": 2.8230842005676444, + "grad_norm": 2.050400733947754, + "learning_rate": 1.7644222790998186e-07, + "loss": 1.0905, + "step": 3730 + }, + { + "epoch": 2.823841059602649, + "grad_norm": 1.9098093509674072, + "learning_rate": 1.7495240534872614e-07, + "loss": 1.031, + "step": 3731 + }, + { + "epoch": 2.8245979186376537, + "grad_norm": 2.1355783939361572, + "learning_rate": 1.734689165078998e-07, + "loss": 1.0944, + "step": 3732 + }, + { + "epoch": 2.8253547776726586, + "grad_norm": 1.9840859174728394, + "learning_rate": 1.7199176234373553e-07, + "loss": 0.9905, + "step": 3733 + }, + { + "epoch": 2.826111636707663, + "grad_norm": 2.0721471309661865, + "learning_rate": 1.7052094380838532e-07, + "loss": 1.059, + "step": 3734 + }, + { + "epoch": 2.826868495742668, + "grad_norm": 2.345816135406494, + "learning_rate": 1.69056461849917e-07, + "loss": 1.0611, + "step": 3735 + }, + { + "epoch": 2.8276253547776724, + "grad_norm": 2.2599689960479736, + "learning_rate": 1.675983174123143e-07, + "loss": 1.0965, + "step": 3736 + }, + { + "epoch": 2.8283822138126773, + "grad_norm": 2.1439452171325684, + "learning_rate": 1.6614651143547243e-07, + "loss": 1.0312, + "step": 3737 + }, + { + "epoch": 2.829139072847682, + "grad_norm": 2.0652458667755127, + "learning_rate": 1.647010448552047e-07, + "loss": 1.0469, + "step": 3738 + }, + { + "epoch": 2.829895931882687, + "grad_norm": 2.0724799633026123, + "learning_rate": 1.63261918603237e-07, + "loss": 1.1211, + "step": 3739 + }, + { + "epoch": 2.8306527909176915, + "grad_norm": 1.8345634937286377, + "learning_rate": 1.618291336072078e-07, + "loss": 1.0573, + "step": 3740 + }, + { + "epoch": 2.8314096499526964, + "grad_norm": 1.9076229333877563, + "learning_rate": 1.6040269079066806e-07, + "loss": 1.0767, + "step": 3741 + }, + { + "epoch": 2.832166508987701, + "grad_norm": 1.9758639335632324, + "learning_rate": 1.5898259107308255e-07, + "loss": 1.0678, + "step": 3742 + }, + { + "epoch": 2.8329233680227057, + "grad_norm": 1.982330560684204, + "learning_rate": 1.5756883536982296e-07, + "loss": 1.1126, + "step": 3743 + }, + { + "epoch": 2.8336802270577106, + "grad_norm": 1.9688644409179688, + "learning_rate": 1.5616142459217799e-07, + "loss": 1.1118, + "step": 3744 + }, + { + "epoch": 2.8344370860927155, + "grad_norm": 2.031545639038086, + "learning_rate": 1.5476035964734117e-07, + "loss": 1.061, + "step": 3745 + }, + { + "epoch": 2.83519394512772, + "grad_norm": 1.9219672679901123, + "learning_rate": 1.5336564143841856e-07, + "loss": 1.031, + "step": 3746 + }, + { + "epoch": 2.835950804162725, + "grad_norm": 2.1881892681121826, + "learning_rate": 1.5197727086442445e-07, + "loss": 1.0689, + "step": 3747 + }, + { + "epoch": 2.8367076631977293, + "grad_norm": 1.885879397392273, + "learning_rate": 1.505952488202789e-07, + "loss": 1.0866, + "step": 3748 + }, + { + "epoch": 2.837464522232734, + "grad_norm": 2.17256760597229, + "learning_rate": 1.492195761968146e-07, + "loss": 1.0774, + "step": 3749 + }, + { + "epoch": 2.838221381267739, + "grad_norm": 2.0141475200653076, + "learning_rate": 1.4785025388076906e-07, + "loss": 1.1078, + "step": 3750 + }, + { + "epoch": 2.8389782403027435, + "grad_norm": 2.1120545864105225, + "learning_rate": 1.4648728275478566e-07, + "loss": 1.0698, + "step": 3751 + }, + { + "epoch": 2.8397350993377484, + "grad_norm": 1.9993555545806885, + "learning_rate": 1.451306636974159e-07, + "loss": 1.0529, + "step": 3752 + }, + { + "epoch": 2.840491958372753, + "grad_norm": 1.9042015075683594, + "learning_rate": 1.4378039758311616e-07, + "loss": 1.035, + "step": 3753 + }, + { + "epoch": 2.8412488174077577, + "grad_norm": 2.0726895332336426, + "learning_rate": 1.4243648528224414e-07, + "loss": 1.0772, + "step": 3754 + }, + { + "epoch": 2.8420056764427626, + "grad_norm": 2.004347085952759, + "learning_rate": 1.4109892766106804e-07, + "loss": 1.066, + "step": 3755 + }, + { + "epoch": 2.8427625354777675, + "grad_norm": 2.1998095512390137, + "learning_rate": 1.397677255817563e-07, + "loss": 1.1011, + "step": 3756 + }, + { + "epoch": 2.843519394512772, + "grad_norm": 1.9528348445892334, + "learning_rate": 1.3844287990238113e-07, + "loss": 1.105, + "step": 3757 + }, + { + "epoch": 2.844276253547777, + "grad_norm": 2.1867001056671143, + "learning_rate": 1.3712439147691946e-07, + "loss": 1.0787, + "step": 3758 + }, + { + "epoch": 2.8450331125827812, + "grad_norm": 2.0233795642852783, + "learning_rate": 1.3581226115524753e-07, + "loss": 1.0587, + "step": 3759 + }, + { + "epoch": 2.845789971617786, + "grad_norm": 1.9928818941116333, + "learning_rate": 1.345064897831441e-07, + "loss": 1.0421, + "step": 3760 + }, + { + "epoch": 2.846546830652791, + "grad_norm": 1.9571059942245483, + "learning_rate": 1.3320707820229063e-07, + "loss": 1.0569, + "step": 3761 + }, + { + "epoch": 2.847303689687796, + "grad_norm": 2.076955795288086, + "learning_rate": 1.3191402725026765e-07, + "loss": 1.0854, + "step": 3762 + }, + { + "epoch": 2.8480605487228003, + "grad_norm": 2.1233267784118652, + "learning_rate": 1.3062733776055504e-07, + "loss": 1.0457, + "step": 3763 + }, + { + "epoch": 2.8488174077578052, + "grad_norm": 1.9417656660079956, + "learning_rate": 1.2934701056253526e-07, + "loss": 1.0308, + "step": 3764 + }, + { + "epoch": 2.8495742667928097, + "grad_norm": 1.9117321968078613, + "learning_rate": 1.2807304648148552e-07, + "loss": 1.0519, + "step": 3765 + }, + { + "epoch": 2.8503311258278146, + "grad_norm": 1.987637996673584, + "learning_rate": 1.2680544633858457e-07, + "loss": 1.0602, + "step": 3766 + }, + { + "epoch": 2.8510879848628194, + "grad_norm": 2.072512626647949, + "learning_rate": 1.2554421095090923e-07, + "loss": 1.1063, + "step": 3767 + }, + { + "epoch": 2.851844843897824, + "grad_norm": 2.4176509380340576, + "learning_rate": 1.2428934113143005e-07, + "loss": 1.0889, + "step": 3768 + }, + { + "epoch": 2.8526017029328288, + "grad_norm": 2.25588059425354, + "learning_rate": 1.2304083768902016e-07, + "loss": 1.0894, + "step": 3769 + }, + { + "epoch": 2.853358561967833, + "grad_norm": 1.9961562156677246, + "learning_rate": 1.2179870142844305e-07, + "loss": 1.07, + "step": 3770 + }, + { + "epoch": 2.854115421002838, + "grad_norm": 2.0790538787841797, + "learning_rate": 1.2056293315036139e-07, + "loss": 1.1308, + "step": 3771 + }, + { + "epoch": 2.854872280037843, + "grad_norm": 2.107841968536377, + "learning_rate": 1.1933353365133393e-07, + "loss": 1.1053, + "step": 3772 + }, + { + "epoch": 2.855629139072848, + "grad_norm": 1.9669723510742188, + "learning_rate": 1.1811050372381292e-07, + "loss": 1.1049, + "step": 3773 + }, + { + "epoch": 2.8563859981078523, + "grad_norm": 1.9607486724853516, + "learning_rate": 1.1689384415614223e-07, + "loss": 1.0817, + "step": 3774 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 1.8808550834655762, + "learning_rate": 1.1568355573256491e-07, + "loss": 1.0328, + "step": 3775 + }, + { + "epoch": 2.8578997161778616, + "grad_norm": 2.0765459537506104, + "learning_rate": 1.1447963923321327e-07, + "loss": 1.0449, + "step": 3776 + }, + { + "epoch": 2.8586565752128665, + "grad_norm": 1.9006658792495728, + "learning_rate": 1.1328209543411224e-07, + "loss": 1.0046, + "step": 3777 + }, + { + "epoch": 2.8594134342478714, + "grad_norm": 2.1098666191101074, + "learning_rate": 1.1209092510718261e-07, + "loss": 1.0837, + "step": 3778 + }, + { + "epoch": 2.8601702932828763, + "grad_norm": 2.088935375213623, + "learning_rate": 1.1090612902023337e-07, + "loss": 1.0702, + "step": 3779 + }, + { + "epoch": 2.8609271523178808, + "grad_norm": 2.2443082332611084, + "learning_rate": 1.0972770793696717e-07, + "loss": 1.0864, + "step": 3780 + }, + { + "epoch": 2.8616840113528856, + "grad_norm": 2.380600929260254, + "learning_rate": 1.0855566261697372e-07, + "loss": 1.073, + "step": 3781 + }, + { + "epoch": 2.86244087038789, + "grad_norm": 2.0619399547576904, + "learning_rate": 1.073899938157375e-07, + "loss": 1.0486, + "step": 3782 + }, + { + "epoch": 2.863197729422895, + "grad_norm": 1.9430749416351318, + "learning_rate": 1.0623070228463008e-07, + "loss": 1.0664, + "step": 3783 + }, + { + "epoch": 2.8639545884579, + "grad_norm": 1.9037846326828003, + "learning_rate": 1.0507778877091445e-07, + "loss": 1.041, + "step": 3784 + }, + { + "epoch": 2.8647114474929043, + "grad_norm": 1.8566458225250244, + "learning_rate": 1.0393125401773843e-07, + "loss": 1.066, + "step": 3785 + }, + { + "epoch": 2.865468306527909, + "grad_norm": 1.8834096193313599, + "learning_rate": 1.027910987641447e-07, + "loss": 1.0902, + "step": 3786 + }, + { + "epoch": 2.866225165562914, + "grad_norm": 2.1945738792419434, + "learning_rate": 1.0165732374505733e-07, + "loss": 1.0731, + "step": 3787 + }, + { + "epoch": 2.8669820245979185, + "grad_norm": 2.057518720626831, + "learning_rate": 1.0052992969128971e-07, + "loss": 1.0807, + "step": 3788 + }, + { + "epoch": 2.8677388836329234, + "grad_norm": 1.897512435913086, + "learning_rate": 9.940891732954447e-08, + "loss": 1.0146, + "step": 3789 + }, + { + "epoch": 2.8684957426679283, + "grad_norm": 1.9884440898895264, + "learning_rate": 9.829428738240904e-08, + "loss": 1.0896, + "step": 3790 + }, + { + "epoch": 2.8692526017029327, + "grad_norm": 2.2165613174438477, + "learning_rate": 9.718604056835573e-08, + "loss": 1.014, + "step": 3791 + }, + { + "epoch": 2.8700094607379376, + "grad_norm": 2.517817258834839, + "learning_rate": 9.608417760174488e-08, + "loss": 1.0497, + "step": 3792 + }, + { + "epoch": 2.870766319772942, + "grad_norm": 1.8578647375106812, + "learning_rate": 9.498869919281952e-08, + "loss": 1.064, + "step": 3793 + }, + { + "epoch": 2.871523178807947, + "grad_norm": 2.168428897857666, + "learning_rate": 9.389960604770966e-08, + "loss": 1.1338, + "step": 3794 + }, + { + "epoch": 2.872280037842952, + "grad_norm": 1.902740716934204, + "learning_rate": 9.281689886842575e-08, + "loss": 1.0833, + "step": 3795 + }, + { + "epoch": 2.8730368968779567, + "grad_norm": 2.10799503326416, + "learning_rate": 9.174057835286632e-08, + "loss": 1.0498, + "step": 3796 + }, + { + "epoch": 2.873793755912961, + "grad_norm": 1.9497560262680054, + "learning_rate": 9.067064519481139e-08, + "loss": 1.0824, + "step": 3797 + }, + { + "epoch": 2.874550614947966, + "grad_norm": 2.052687406539917, + "learning_rate": 8.96071000839214e-08, + "loss": 1.0838, + "step": 3798 + }, + { + "epoch": 2.8753074739829705, + "grad_norm": 2.033168315887451, + "learning_rate": 8.854994370574378e-08, + "loss": 1.0696, + "step": 3799 + }, + { + "epoch": 2.8760643330179754, + "grad_norm": 1.928364872932434, + "learning_rate": 8.749917674170415e-08, + "loss": 1.0824, + "step": 3800 + }, + { + "epoch": 2.8768211920529803, + "grad_norm": 1.9337732791900635, + "learning_rate": 8.645479986911066e-08, + "loss": 1.117, + "step": 3801 + }, + { + "epoch": 2.8775780510879847, + "grad_norm": 1.929337501525879, + "learning_rate": 8.541681376115416e-08, + "loss": 1.0448, + "step": 3802 + }, + { + "epoch": 2.8783349101229896, + "grad_norm": 2.0957815647125244, + "learning_rate": 8.438521908690244e-08, + "loss": 1.0833, + "step": 3803 + }, + { + "epoch": 2.8790917691579945, + "grad_norm": 2.090304136276245, + "learning_rate": 8.336001651130706e-08, + "loss": 1.0567, + "step": 3804 + }, + { + "epoch": 2.879848628192999, + "grad_norm": 2.115295648574829, + "learning_rate": 8.234120669519771e-08, + "loss": 1.031, + "step": 3805 + }, + { + "epoch": 2.880605487228004, + "grad_norm": 1.9636808633804321, + "learning_rate": 8.132879029528445e-08, + "loss": 1.0494, + "step": 3806 + }, + { + "epoch": 2.8813623462630087, + "grad_norm": 1.9101999998092651, + "learning_rate": 8.03227679641533e-08, + "loss": 1.0571, + "step": 3807 + }, + { + "epoch": 2.882119205298013, + "grad_norm": 1.9551316499710083, + "learning_rate": 7.932314035027393e-08, + "loss": 1.0658, + "step": 3808 + }, + { + "epoch": 2.882876064333018, + "grad_norm": 2.1239876747131348, + "learning_rate": 7.832990809798869e-08, + "loss": 1.0788, + "step": 3809 + }, + { + "epoch": 2.8836329233680225, + "grad_norm": 1.9690558910369873, + "learning_rate": 7.734307184752134e-08, + "loss": 1.0772, + "step": 3810 + }, + { + "epoch": 2.8843897824030273, + "grad_norm": 2.071542263031006, + "learning_rate": 7.636263223496941e-08, + "loss": 1.0839, + "step": 3811 + }, + { + "epoch": 2.8851466414380322, + "grad_norm": 2.1702964305877686, + "learning_rate": 7.538858989231189e-08, + "loss": 1.0452, + "step": 3812 + }, + { + "epoch": 2.885903500473037, + "grad_norm": 2.1600115299224854, + "learning_rate": 7.442094544740037e-08, + "loss": 1.133, + "step": 3813 + }, + { + "epoch": 2.8866603595080416, + "grad_norm": 1.943969964981079, + "learning_rate": 7.34596995239646e-08, + "loss": 1.0342, + "step": 3814 + }, + { + "epoch": 2.8874172185430464, + "grad_norm": 2.029170513153076, + "learning_rate": 7.250485274160693e-08, + "loss": 1.0983, + "step": 3815 + }, + { + "epoch": 2.888174077578051, + "grad_norm": 2.1345629692077637, + "learning_rate": 7.1556405715809e-08, + "loss": 1.0854, + "step": 3816 + }, + { + "epoch": 2.8889309366130558, + "grad_norm": 1.8675469160079956, + "learning_rate": 7.061435905792389e-08, + "loss": 1.0661, + "step": 3817 + }, + { + "epoch": 2.8896877956480607, + "grad_norm": 2.0254111289978027, + "learning_rate": 6.967871337518176e-08, + "loss": 1.103, + "step": 3818 + }, + { + "epoch": 2.8904446546830656, + "grad_norm": 2.162344455718994, + "learning_rate": 6.874946927068538e-08, + "loss": 1.0878, + "step": 3819 + }, + { + "epoch": 2.89120151371807, + "grad_norm": 2.2124130725860596, + "learning_rate": 6.782662734341012e-08, + "loss": 1.0599, + "step": 3820 + }, + { + "epoch": 2.891958372753075, + "grad_norm": 2.1409800052642822, + "learning_rate": 6.691018818820837e-08, + "loss": 1.0525, + "step": 3821 + }, + { + "epoch": 2.8927152317880793, + "grad_norm": 2.1800687313079834, + "learning_rate": 6.600015239579959e-08, + "loss": 1.0602, + "step": 3822 + }, + { + "epoch": 2.893472090823084, + "grad_norm": 2.0903069972991943, + "learning_rate": 6.50965205527814e-08, + "loss": 1.0851, + "step": 3823 + }, + { + "epoch": 2.894228949858089, + "grad_norm": 1.9317938089370728, + "learning_rate": 6.419929324162068e-08, + "loss": 1.0319, + "step": 3824 + }, + { + "epoch": 2.8949858088930935, + "grad_norm": 2.0327014923095703, + "learning_rate": 6.330847104065472e-08, + "loss": 1.1128, + "step": 3825 + }, + { + "epoch": 2.8957426679280984, + "grad_norm": 2.1695809364318848, + "learning_rate": 6.242405452409559e-08, + "loss": 1.0591, + "step": 3826 + }, + { + "epoch": 2.896499526963103, + "grad_norm": 2.077954053878784, + "learning_rate": 6.154604426202468e-08, + "loss": 1.0295, + "step": 3827 + }, + { + "epoch": 2.8972563859981078, + "grad_norm": 2.0263519287109375, + "learning_rate": 6.067444082039482e-08, + "loss": 1.0147, + "step": 3828 + }, + { + "epoch": 2.8980132450331126, + "grad_norm": 2.1431772708892822, + "learning_rate": 5.980924476102595e-08, + "loss": 1.0512, + "step": 3829 + }, + { + "epoch": 2.8987701040681175, + "grad_norm": 1.9561032056808472, + "learning_rate": 5.895045664161168e-08, + "loss": 1.0426, + "step": 3830 + }, + { + "epoch": 2.899526963103122, + "grad_norm": 2.133995532989502, + "learning_rate": 5.8098077015713814e-08, + "loss": 1.0365, + "step": 3831 + }, + { + "epoch": 2.900283822138127, + "grad_norm": 2.5247886180877686, + "learning_rate": 5.7252106432762304e-08, + "loss": 1.1153, + "step": 3832 + }, + { + "epoch": 2.9010406811731313, + "grad_norm": 1.9548890590667725, + "learning_rate": 5.6412545438057476e-08, + "loss": 1.0739, + "step": 3833 + }, + { + "epoch": 2.901797540208136, + "grad_norm": 1.98203444480896, + "learning_rate": 5.557939457276783e-08, + "loss": 1.0844, + "step": 3834 + }, + { + "epoch": 2.902554399243141, + "grad_norm": 2.1283376216888428, + "learning_rate": 5.475265437393116e-08, + "loss": 1.0675, + "step": 3835 + }, + { + "epoch": 2.903311258278146, + "grad_norm": 1.917360782623291, + "learning_rate": 5.393232537444783e-08, + "loss": 1.0464, + "step": 3836 + }, + { + "epoch": 2.9040681173131504, + "grad_norm": 1.9345555305480957, + "learning_rate": 5.3118408103091954e-08, + "loss": 1.0937, + "step": 3837 + }, + { + "epoch": 2.9048249763481553, + "grad_norm": 2.1080758571624756, + "learning_rate": 5.2310903084502445e-08, + "loss": 1.0984, + "step": 3838 + }, + { + "epoch": 2.9055818353831597, + "grad_norm": 2.0316121578216553, + "learning_rate": 5.150981083918309e-08, + "loss": 1.105, + "step": 3839 + }, + { + "epoch": 2.9063386944181646, + "grad_norm": 2.1214966773986816, + "learning_rate": 5.0715131883506914e-08, + "loss": 1.0481, + "step": 3840 + }, + { + "epoch": 2.9070955534531695, + "grad_norm": 2.003058433532715, + "learning_rate": 4.99268667297129e-08, + "loss": 1.0848, + "step": 3841 + }, + { + "epoch": 2.907852412488174, + "grad_norm": 2.0405402183532715, + "learning_rate": 4.9145015885902656e-08, + "loss": 1.1065, + "step": 3842 + }, + { + "epoch": 2.908609271523179, + "grad_norm": 2.1864330768585205, + "learning_rate": 4.836957985604592e-08, + "loss": 1.1217, + "step": 3843 + }, + { + "epoch": 2.9093661305581833, + "grad_norm": 1.8275071382522583, + "learning_rate": 4.7600559139976164e-08, + "loss": 1.0634, + "step": 3844 + }, + { + "epoch": 2.910122989593188, + "grad_norm": 2.006591320037842, + "learning_rate": 4.683795423339395e-08, + "loss": 1.0702, + "step": 3845 + }, + { + "epoch": 2.910879848628193, + "grad_norm": 2.4923205375671387, + "learning_rate": 4.608176562786352e-08, + "loss": 1.0633, + "step": 3846 + }, + { + "epoch": 2.911636707663198, + "grad_norm": 1.8209044933319092, + "learning_rate": 4.533199381080951e-08, + "loss": 1.0856, + "step": 3847 + }, + { + "epoch": 2.9123935666982024, + "grad_norm": 2.0942399501800537, + "learning_rate": 4.458863926552586e-08, + "loss": 1.0936, + "step": 3848 + }, + { + "epoch": 2.9131504257332073, + "grad_norm": 2.149657964706421, + "learning_rate": 4.385170247116687e-08, + "loss": 1.0472, + "step": 3849 + }, + { + "epoch": 2.9139072847682117, + "grad_norm": 1.905176043510437, + "learning_rate": 4.3121183902750584e-08, + "loss": 1.0524, + "step": 3850 + }, + { + "epoch": 2.9146641438032166, + "grad_norm": 2.1164419651031494, + "learning_rate": 4.2397084031158755e-08, + "loss": 1.0406, + "step": 3851 + }, + { + "epoch": 2.9154210028382215, + "grad_norm": 1.955041527748108, + "learning_rate": 4.1679403323133525e-08, + "loss": 1.0788, + "step": 3852 + }, + { + "epoch": 2.9161778618732264, + "grad_norm": 2.0062403678894043, + "learning_rate": 4.096814224128301e-08, + "loss": 1.1013, + "step": 3853 + }, + { + "epoch": 2.916934720908231, + "grad_norm": 2.1404199600219727, + "learning_rate": 4.0263301244073465e-08, + "loss": 1.1179, + "step": 3854 + }, + { + "epoch": 2.9176915799432357, + "grad_norm": 1.968444585800171, + "learning_rate": 3.9564880785834875e-08, + "loss": 1.0376, + "step": 3855 + }, + { + "epoch": 2.91844843897824, + "grad_norm": 1.9066696166992188, + "learning_rate": 3.887288131676096e-08, + "loss": 1.0348, + "step": 3856 + }, + { + "epoch": 2.919205298013245, + "grad_norm": 2.04758620262146, + "learning_rate": 3.818730328290026e-08, + "loss": 1.0696, + "step": 3857 + }, + { + "epoch": 2.91996215704825, + "grad_norm": 1.8712373971939087, + "learning_rate": 3.750814712616839e-08, + "loss": 1.0559, + "step": 3858 + }, + { + "epoch": 2.9207190160832543, + "grad_norm": 1.9164494276046753, + "learning_rate": 3.6835413284338016e-08, + "loss": 1.0717, + "step": 3859 + }, + { + "epoch": 2.9214758751182592, + "grad_norm": 1.9720449447631836, + "learning_rate": 3.616910219104442e-08, + "loss": 1.1104, + "step": 3860 + }, + { + "epoch": 2.9222327341532637, + "grad_norm": 2.1905975341796875, + "learning_rate": 3.5509214275779944e-08, + "loss": 1.1058, + "step": 3861 + }, + { + "epoch": 2.9229895931882686, + "grad_norm": 1.912367582321167, + "learning_rate": 3.4855749963898434e-08, + "loss": 1.0694, + "step": 3862 + }, + { + "epoch": 2.9237464522232735, + "grad_norm": 2.054760456085205, + "learning_rate": 3.420870967661412e-08, + "loss": 1.0661, + "step": 3863 + }, + { + "epoch": 2.9245033112582783, + "grad_norm": 2.100724220275879, + "learning_rate": 3.3568093830998316e-08, + "loss": 1.0685, + "step": 3864 + }, + { + "epoch": 2.925260170293283, + "grad_norm": 1.9942377805709839, + "learning_rate": 3.2933902839982706e-08, + "loss": 1.0362, + "step": 3865 + }, + { + "epoch": 2.9260170293282877, + "grad_norm": 1.9845491647720337, + "learning_rate": 3.230613711235715e-08, + "loss": 1.0948, + "step": 3866 + }, + { + "epoch": 2.926773888363292, + "grad_norm": 1.9502067565917969, + "learning_rate": 3.168479705276969e-08, + "loss": 1.0292, + "step": 3867 + }, + { + "epoch": 2.927530747398297, + "grad_norm": 2.2125320434570312, + "learning_rate": 3.106988306172764e-08, + "loss": 1.0965, + "step": 3868 + }, + { + "epoch": 2.928287606433302, + "grad_norm": 2.0215206146240234, + "learning_rate": 3.046139553559317e-08, + "loss": 1.0451, + "step": 3869 + }, + { + "epoch": 2.9290444654683068, + "grad_norm": 2.161459445953369, + "learning_rate": 2.985933486658992e-08, + "loss": 1.0826, + "step": 3870 + }, + { + "epoch": 2.929801324503311, + "grad_norm": 2.112816572189331, + "learning_rate": 2.926370144279531e-08, + "loss": 1.1162, + "step": 3871 + }, + { + "epoch": 2.930558183538316, + "grad_norm": 2.043856382369995, + "learning_rate": 2.8674495648147115e-08, + "loss": 1.0377, + "step": 3872 + }, + { + "epoch": 2.9313150425733205, + "grad_norm": 2.230227470397949, + "learning_rate": 2.809171786243685e-08, + "loss": 1.0775, + "step": 3873 + }, + { + "epoch": 2.9320719016083254, + "grad_norm": 1.8375619649887085, + "learning_rate": 2.7515368461316434e-08, + "loss": 1.0217, + "step": 3874 + }, + { + "epoch": 2.9328287606433303, + "grad_norm": 1.9141755104064941, + "learning_rate": 2.694544781629039e-08, + "loss": 1.038, + "step": 3875 + }, + { + "epoch": 2.9335856196783348, + "grad_norm": 2.1602697372436523, + "learning_rate": 2.6381956294720323e-08, + "loss": 1.0714, + "step": 3876 + }, + { + "epoch": 2.9343424787133396, + "grad_norm": 1.923949956893921, + "learning_rate": 2.5824894259825987e-08, + "loss": 1.0561, + "step": 3877 + }, + { + "epoch": 2.9350993377483445, + "grad_norm": 2.1230146884918213, + "learning_rate": 2.5274262070678672e-08, + "loss": 1.0689, + "step": 3878 + }, + { + "epoch": 2.935856196783349, + "grad_norm": 2.0640814304351807, + "learning_rate": 2.4730060082210033e-08, + "loss": 1.0851, + "step": 3879 + }, + { + "epoch": 2.936613055818354, + "grad_norm": 1.8468024730682373, + "learning_rate": 2.4192288645203268e-08, + "loss": 1.0588, + "step": 3880 + }, + { + "epoch": 2.9373699148533587, + "grad_norm": 2.06715726852417, + "learning_rate": 2.3660948106297502e-08, + "loss": 1.04, + "step": 3881 + }, + { + "epoch": 2.938126773888363, + "grad_norm": 2.129422426223755, + "learning_rate": 2.313603880798671e-08, + "loss": 1.0935, + "step": 3882 + }, + { + "epoch": 2.938883632923368, + "grad_norm": 1.9994871616363525, + "learning_rate": 2.2617561088619707e-08, + "loss": 1.0823, + "step": 3883 + }, + { + "epoch": 2.9396404919583725, + "grad_norm": 1.9387072324752808, + "learning_rate": 2.2105515282399045e-08, + "loss": 1.0506, + "step": 3884 + }, + { + "epoch": 2.9403973509933774, + "grad_norm": 1.9773590564727783, + "learning_rate": 2.1599901719382117e-08, + "loss": 1.0713, + "step": 3885 + }, + { + "epoch": 2.9411542100283823, + "grad_norm": 1.9165699481964111, + "learning_rate": 2.110072072547893e-08, + "loss": 1.0889, + "step": 3886 + }, + { + "epoch": 2.941911069063387, + "grad_norm": 1.9767038822174072, + "learning_rate": 2.060797262245434e-08, + "loss": 1.1121, + "step": 3887 + }, + { + "epoch": 2.9426679280983916, + "grad_norm": 1.9442821741104126, + "learning_rate": 2.012165772792693e-08, + "loss": 1.0852, + "step": 3888 + }, + { + "epoch": 2.9434247871333965, + "grad_norm": 1.9988024234771729, + "learning_rate": 1.96417763553668e-08, + "loss": 1.0733, + "step": 3889 + }, + { + "epoch": 2.944181646168401, + "grad_norm": 2.0853540897369385, + "learning_rate": 1.91683288141e-08, + "loss": 1.0869, + "step": 3890 + }, + { + "epoch": 2.944938505203406, + "grad_norm": 1.8882020711898804, + "learning_rate": 1.8701315409300757e-08, + "loss": 1.0716, + "step": 3891 + }, + { + "epoch": 2.9456953642384107, + "grad_norm": 1.9533286094665527, + "learning_rate": 1.8240736442000363e-08, + "loss": 1.0977, + "step": 3892 + }, + { + "epoch": 2.946452223273415, + "grad_norm": 2.2869935035705566, + "learning_rate": 1.7786592209081624e-08, + "loss": 1.036, + "step": 3893 + }, + { + "epoch": 2.94720908230842, + "grad_norm": 1.8621643781661987, + "learning_rate": 1.733888300327774e-08, + "loss": 1.0655, + "step": 3894 + }, + { + "epoch": 2.947965941343425, + "grad_norm": 2.069187641143799, + "learning_rate": 1.689760911317565e-08, + "loss": 1.0701, + "step": 3895 + }, + { + "epoch": 2.9487228003784294, + "grad_norm": 2.112271547317505, + "learning_rate": 1.64627708232138e-08, + "loss": 1.0697, + "step": 3896 + }, + { + "epoch": 2.9494796594134343, + "grad_norm": 2.0788121223449707, + "learning_rate": 1.6034368413683266e-08, + "loss": 1.0746, + "step": 3897 + }, + { + "epoch": 2.950236518448439, + "grad_norm": 2.0578696727752686, + "learning_rate": 1.56124021607244e-08, + "loss": 1.0702, + "step": 3898 + }, + { + "epoch": 2.9509933774834436, + "grad_norm": 2.171917676925659, + "learning_rate": 1.519687233633019e-08, + "loss": 1.0755, + "step": 3899 + }, + { + "epoch": 2.9517502365184485, + "grad_norm": 1.9235490560531616, + "learning_rate": 1.4787779208345125e-08, + "loss": 1.0209, + "step": 3900 + }, + { + "epoch": 2.952507095553453, + "grad_norm": 2.046241283416748, + "learning_rate": 1.4385123040465213e-08, + "loss": 1.0691, + "step": 3901 + }, + { + "epoch": 2.953263954588458, + "grad_norm": 2.0865299701690674, + "learning_rate": 1.398890409223575e-08, + "loss": 1.0399, + "step": 3902 + }, + { + "epoch": 2.9540208136234627, + "grad_norm": 2.0731747150421143, + "learning_rate": 1.3599122619053542e-08, + "loss": 1.0499, + "step": 3903 + }, + { + "epoch": 2.9547776726584676, + "grad_norm": 2.198157548904419, + "learning_rate": 1.32157788721658e-08, + "loss": 1.0247, + "step": 3904 + }, + { + "epoch": 2.955534531693472, + "grad_norm": 2.0734500885009766, + "learning_rate": 1.2838873098669024e-08, + "loss": 1.0417, + "step": 3905 + }, + { + "epoch": 2.956291390728477, + "grad_norm": 1.8891007900238037, + "learning_rate": 1.2468405541513447e-08, + "loss": 1.0541, + "step": 3906 + }, + { + "epoch": 2.9570482497634814, + "grad_norm": 1.9352359771728516, + "learning_rate": 1.210437643949415e-08, + "loss": 1.0932, + "step": 3907 + }, + { + "epoch": 2.9578051087984862, + "grad_norm": 1.9413546323776245, + "learning_rate": 1.1746786027259944e-08, + "loss": 1.082, + "step": 3908 + }, + { + "epoch": 2.958561967833491, + "grad_norm": 2.091618299484253, + "learning_rate": 1.1395634535308943e-08, + "loss": 1.0216, + "step": 3909 + }, + { + "epoch": 2.959318826868496, + "grad_norm": 2.132253408432007, + "learning_rate": 1.1050922189986316e-08, + "loss": 1.0842, + "step": 3910 + }, + { + "epoch": 2.9600756859035005, + "grad_norm": 2.1053178310394287, + "learning_rate": 1.0712649213489865e-08, + "loss": 1.0316, + "step": 3911 + }, + { + "epoch": 2.9608325449385053, + "grad_norm": 2.1205570697784424, + "learning_rate": 1.0380815823864458e-08, + "loss": 1.065, + "step": 3912 + }, + { + "epoch": 2.96158940397351, + "grad_norm": 2.1178319454193115, + "learning_rate": 1.0055422235004254e-08, + "loss": 1.0666, + "step": 3913 + }, + { + "epoch": 2.9623462630085147, + "grad_norm": 2.021894693374634, + "learning_rate": 9.736468656653818e-09, + "loss": 1.0567, + "step": 3914 + }, + { + "epoch": 2.9631031220435196, + "grad_norm": 1.921276569366455, + "learning_rate": 9.423955294405891e-09, + "loss": 1.093, + "step": 3915 + }, + { + "epoch": 2.963859981078524, + "grad_norm": 2.062957286834717, + "learning_rate": 9.117882349702507e-09, + "loss": 1.0449, + "step": 3916 + }, + { + "epoch": 2.964616840113529, + "grad_norm": 2.258112668991089, + "learning_rate": 8.818250019831662e-09, + "loss": 1.0587, + "step": 3917 + }, + { + "epoch": 2.9653736991485333, + "grad_norm": 1.8993000984191895, + "learning_rate": 8.52505849793286e-09, + "loss": 1.0402, + "step": 3918 + }, + { + "epoch": 2.966130558183538, + "grad_norm": 1.8386201858520508, + "learning_rate": 8.23830797299268e-09, + "loss": 1.1062, + "step": 3919 + }, + { + "epoch": 2.966887417218543, + "grad_norm": 2.060410976409912, + "learning_rate": 7.957998629846991e-09, + "loss": 1.066, + "step": 3920 + }, + { + "epoch": 2.967644276253548, + "grad_norm": 2.098123073577881, + "learning_rate": 7.684130649177623e-09, + "loss": 1.0881, + "step": 3921 + }, + { + "epoch": 2.9684011352885524, + "grad_norm": 2.2169816493988037, + "learning_rate": 7.416704207515695e-09, + "loss": 1.0661, + "step": 3922 + }, + { + "epoch": 2.9691579943235573, + "grad_norm": 2.0316176414489746, + "learning_rate": 7.155719477241619e-09, + "loss": 1.0737, + "step": 3923 + }, + { + "epoch": 2.9699148533585618, + "grad_norm": 1.8836135864257812, + "learning_rate": 6.901176626581769e-09, + "loss": 1.0435, + "step": 3924 + }, + { + "epoch": 2.9706717123935666, + "grad_norm": 2.030869960784912, + "learning_rate": 6.653075819609588e-09, + "loss": 1.0358, + "step": 3925 + }, + { + "epoch": 2.9714285714285715, + "grad_norm": 2.1759679317474365, + "learning_rate": 6.411417216247812e-09, + "loss": 1.066, + "step": 3926 + }, + { + "epoch": 2.9721854304635764, + "grad_norm": 2.092773199081421, + "learning_rate": 6.176200972265136e-09, + "loss": 1.031, + "step": 3927 + }, + { + "epoch": 2.972942289498581, + "grad_norm": 2.3539814949035645, + "learning_rate": 5.947427239279547e-09, + "loss": 1.1136, + "step": 3928 + }, + { + "epoch": 2.9736991485335857, + "grad_norm": 2.3484017848968506, + "learning_rate": 5.725096164753884e-09, + "loss": 1.0145, + "step": 3929 + }, + { + "epoch": 2.97445600756859, + "grad_norm": 1.9310166835784912, + "learning_rate": 5.509207892001385e-09, + "loss": 1.0231, + "step": 3930 + }, + { + "epoch": 2.975212866603595, + "grad_norm": 1.9161075353622437, + "learning_rate": 5.299762560177922e-09, + "loss": 1.0041, + "step": 3931 + }, + { + "epoch": 2.9759697256386, + "grad_norm": 2.0112030506134033, + "learning_rate": 5.096760304289763e-09, + "loss": 1.1227, + "step": 3932 + }, + { + "epoch": 2.9767265846736044, + "grad_norm": 2.0244789123535156, + "learning_rate": 4.900201255189143e-09, + "loss": 1.0922, + "step": 3933 + }, + { + "epoch": 2.9774834437086093, + "grad_norm": 2.514658212661743, + "learning_rate": 4.710085539575363e-09, + "loss": 1.054, + "step": 3934 + }, + { + "epoch": 2.9782403027436137, + "grad_norm": 1.9655340909957886, + "learning_rate": 4.526413279993689e-09, + "loss": 1.0453, + "step": 3935 + }, + { + "epoch": 2.9789971617786186, + "grad_norm": 1.988950490951538, + "learning_rate": 4.349184594836453e-09, + "loss": 1.112, + "step": 3936 + }, + { + "epoch": 2.9797540208136235, + "grad_norm": 2.0003416538238525, + "learning_rate": 4.178399598341953e-09, + "loss": 1.0888, + "step": 3937 + }, + { + "epoch": 2.9805108798486284, + "grad_norm": 1.8892840147018433, + "learning_rate": 4.014058400597776e-09, + "loss": 1.0316, + "step": 3938 + }, + { + "epoch": 2.981267738883633, + "grad_norm": 1.8778574466705322, + "learning_rate": 3.856161107533029e-09, + "loss": 1.0264, + "step": 3939 + }, + { + "epoch": 2.9820245979186377, + "grad_norm": 1.9889436960220337, + "learning_rate": 3.70470782092722e-09, + "loss": 1.0775, + "step": 3940 + }, + { + "epoch": 2.982781456953642, + "grad_norm": 1.9453630447387695, + "learning_rate": 3.55969863840471e-09, + "loss": 1.0661, + "step": 3941 + }, + { + "epoch": 2.983538315988647, + "grad_norm": 2.1750118732452393, + "learning_rate": 3.421133653436929e-09, + "loss": 1.0582, + "step": 3942 + }, + { + "epoch": 2.984295175023652, + "grad_norm": 1.9517415761947632, + "learning_rate": 3.289012955339048e-09, + "loss": 1.061, + "step": 3943 + }, + { + "epoch": 2.985052034058657, + "grad_norm": 1.9353458881378174, + "learning_rate": 3.16333662927553e-09, + "loss": 1.0533, + "step": 3944 + }, + { + "epoch": 2.9858088930936613, + "grad_norm": 2.1572377681732178, + "learning_rate": 3.044104756254578e-09, + "loss": 1.0871, + "step": 3945 + }, + { + "epoch": 2.986565752128666, + "grad_norm": 1.9636902809143066, + "learning_rate": 2.9313174131325764e-09, + "loss": 1.0751, + "step": 3946 + }, + { + "epoch": 2.9873226111636706, + "grad_norm": 1.9330499172210693, + "learning_rate": 2.8249746726085392e-09, + "loss": 1.0858, + "step": 3947 + }, + { + "epoch": 2.9880794701986755, + "grad_norm": 1.8204699754714966, + "learning_rate": 2.7250766032307735e-09, + "loss": 1.034, + "step": 3948 + }, + { + "epoch": 2.9888363292336804, + "grad_norm": 1.9643014669418335, + "learning_rate": 2.6316232693913253e-09, + "loss": 1.062, + "step": 3949 + }, + { + "epoch": 2.989593188268685, + "grad_norm": 2.25469708442688, + "learning_rate": 2.544614731329312e-09, + "loss": 1.0581, + "step": 3950 + }, + { + "epoch": 2.9903500473036897, + "grad_norm": 2.0587730407714844, + "learning_rate": 2.464051045128703e-09, + "loss": 1.0528, + "step": 3951 + }, + { + "epoch": 2.9911069063386946, + "grad_norm": 1.9544748067855835, + "learning_rate": 2.389932262720538e-09, + "loss": 1.0655, + "step": 3952 + }, + { + "epoch": 2.991863765373699, + "grad_norm": 2.1084601879119873, + "learning_rate": 2.3222584318784854e-09, + "loss": 1.1346, + "step": 3953 + }, + { + "epoch": 2.992620624408704, + "grad_norm": 2.051255226135254, + "learning_rate": 2.261029596226618e-09, + "loss": 1.1014, + "step": 3954 + }, + { + "epoch": 2.993377483443709, + "grad_norm": 2.079298734664917, + "learning_rate": 2.206245795231637e-09, + "loss": 1.092, + "step": 3955 + }, + { + "epoch": 2.9941343424787132, + "grad_norm": 1.9083516597747803, + "learning_rate": 2.157907064203985e-09, + "loss": 1.0385, + "step": 3956 + }, + { + "epoch": 2.994891201513718, + "grad_norm": 2.315964460372925, + "learning_rate": 2.1160134343056167e-09, + "loss": 1.0794, + "step": 3957 + }, + { + "epoch": 2.9956480605487226, + "grad_norm": 2.072871685028076, + "learning_rate": 2.080564932537786e-09, + "loss": 1.0603, + "step": 3958 + }, + { + "epoch": 2.9964049195837275, + "grad_norm": 1.996877908706665, + "learning_rate": 2.0515615817510374e-09, + "loss": 1.066, + "step": 3959 + }, + { + "epoch": 2.9971617786187323, + "grad_norm": 2.1000583171844482, + "learning_rate": 2.0290034006407686e-09, + "loss": 1.0554, + "step": 3960 + }, + { + "epoch": 2.9979186376537372, + "grad_norm": 2.0836434364318848, + "learning_rate": 2.0128904037472256e-09, + "loss": 1.0769, + "step": 3961 + }, + { + "epoch": 2.9986754966887417, + "grad_norm": 1.999711275100708, + "learning_rate": 2.0032226014555062e-09, + "loss": 1.0731, + "step": 3962 + }, + { + "epoch": 2.9994323557237466, + "grad_norm": 2.6111867427825928, + "learning_rate": 2e-09, + "loss": 1.1002, + "step": 3963 + } + ], + "logging_steps": 1.0, + "max_steps": 3963, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.729010317829918e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}