diff --git "a/checkpoint-723/trainer_state.json" "b/checkpoint-723/trainer_state.json" deleted file mode 100644--- "a/checkpoint-723/trainer_state.json" +++ /dev/null @@ -1,5126 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 1.0, - "eval_steps": 181, - "global_step": 723, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0013831258644536654, - "grad_norm": 0.5777013897895813, - "learning_rate": 2.0000000000000003e-06, - "loss": 1.8618, - "step": 1 - }, - { - "epoch": 0.0013831258644536654, - "eval_loss": 1.4421294927597046, - "eval_runtime": 109.7132, - "eval_samples_per_second": 13.162, - "eval_steps_per_second": 0.829, - "step": 1 - }, - { - "epoch": 0.0027662517289073307, - "grad_norm": 0.6331229209899902, - "learning_rate": 4.000000000000001e-06, - "loss": 1.7491, - "step": 2 - }, - { - "epoch": 0.004149377593360996, - "grad_norm": 0.5830245018005371, - "learning_rate": 6e-06, - "loss": 1.8016, - "step": 3 - }, - { - "epoch": 0.005532503457814661, - "grad_norm": 0.5829038023948669, - "learning_rate": 8.000000000000001e-06, - "loss": 1.9693, - "step": 4 - }, - { - "epoch": 0.006915629322268326, - "grad_norm": 0.552374005317688, - "learning_rate": 1e-05, - "loss": 1.9286, - "step": 5 - }, - { - "epoch": 0.008298755186721992, - "grad_norm": 0.5776746869087219, - "learning_rate": 1.2e-05, - "loss": 1.7761, - "step": 6 - }, - { - "epoch": 0.009681881051175657, - "grad_norm": 0.5832110643386841, - "learning_rate": 1.4e-05, - "loss": 1.818, - "step": 7 - }, - { - "epoch": 0.011065006915629323, - "grad_norm": 0.5438941717147827, - "learning_rate": 1.6000000000000003e-05, - "loss": 1.863, - "step": 8 - }, - { - "epoch": 0.012448132780082987, - "grad_norm": 0.583209216594696, - "learning_rate": 1.8e-05, - "loss": 1.9574, - "step": 9 - }, - { - "epoch": 0.013831258644536652, - "grad_norm": 0.5854378938674927, - "learning_rate": 2e-05, - "loss": 1.9059, - "step": 10 - }, - { - "epoch": 0.015214384508990318, - "grad_norm": 0.49761173129081726, - "learning_rate": 1.999997606900827e-05, - "loss": 1.8309, - "step": 11 - }, - { - "epoch": 0.016597510373443983, - "grad_norm": 0.4992309808731079, - "learning_rate": 1.999990427614762e-05, - "loss": 1.7712, - "step": 12 - }, - { - "epoch": 0.017980636237897647, - "grad_norm": 0.49240681529045105, - "learning_rate": 1.999978462176166e-05, - "loss": 1.8119, - "step": 13 - }, - { - "epoch": 0.019363762102351315, - "grad_norm": 0.469261109828949, - "learning_rate": 1.999961710642308e-05, - "loss": 1.8405, - "step": 14 - }, - { - "epoch": 0.02074688796680498, - "grad_norm": 0.4924378991127014, - "learning_rate": 1.999940173093365e-05, - "loss": 1.7209, - "step": 15 - }, - { - "epoch": 0.022130013831258646, - "grad_norm": 0.5745378732681274, - "learning_rate": 1.999913849632419e-05, - "loss": 1.8454, - "step": 16 - }, - { - "epoch": 0.02351313969571231, - "grad_norm": 0.5211701989173889, - "learning_rate": 1.9998827403854596e-05, - "loss": 1.8306, - "step": 17 - }, - { - "epoch": 0.024896265560165973, - "grad_norm": 0.5535211563110352, - "learning_rate": 1.9998468455013825e-05, - "loss": 1.8723, - "step": 18 - }, - { - "epoch": 0.02627939142461964, - "grad_norm": 0.5848811268806458, - "learning_rate": 1.9998061651519868e-05, - "loss": 1.7475, - "step": 19 - }, - { - "epoch": 0.027662517289073305, - "grad_norm": 0.6234457492828369, - "learning_rate": 1.999760699531977e-05, - "loss": 1.7606, - "step": 20 - }, - { - "epoch": 0.029045643153526972, - "grad_norm": 0.559583842754364, - "learning_rate": 1.9997104488589607e-05, - "loss": 1.7085, - "step": 21 - }, - { - "epoch": 0.030428769017980636, - "grad_norm": 0.5410158038139343, - "learning_rate": 1.9996554133734473e-05, - "loss": 1.7188, - "step": 22 - }, - { - "epoch": 0.0318118948824343, - "grad_norm": 0.49627774953842163, - "learning_rate": 1.999595593338848e-05, - "loss": 1.8304, - "step": 23 - }, - { - "epoch": 0.03319502074688797, - "grad_norm": 0.49211692810058594, - "learning_rate": 1.9995309890414735e-05, - "loss": 1.9025, - "step": 24 - }, - { - "epoch": 0.034578146611341634, - "grad_norm": 0.5097021460533142, - "learning_rate": 1.9994616007905318e-05, - "loss": 1.8533, - "step": 25 - }, - { - "epoch": 0.035961272475795295, - "grad_norm": 0.4716247320175171, - "learning_rate": 1.99938742891813e-05, - "loss": 1.7933, - "step": 26 - }, - { - "epoch": 0.03734439834024896, - "grad_norm": 0.5546444058418274, - "learning_rate": 1.9993084737792687e-05, - "loss": 1.7098, - "step": 27 - }, - { - "epoch": 0.03872752420470263, - "grad_norm": 0.6025681495666504, - "learning_rate": 1.9992247357518428e-05, - "loss": 1.8161, - "step": 28 - }, - { - "epoch": 0.040110650069156296, - "grad_norm": 0.5782088041305542, - "learning_rate": 1.9991362152366393e-05, - "loss": 1.7695, - "step": 29 - }, - { - "epoch": 0.04149377593360996, - "grad_norm": 0.5509287714958191, - "learning_rate": 1.9990429126573353e-05, - "loss": 1.8172, - "step": 30 - }, - { - "epoch": 0.042876901798063624, - "grad_norm": 0.504231333732605, - "learning_rate": 1.9989448284604947e-05, - "loss": 1.8554, - "step": 31 - }, - { - "epoch": 0.04426002766251729, - "grad_norm": 0.5206706523895264, - "learning_rate": 1.9988419631155686e-05, - "loss": 1.7808, - "step": 32 - }, - { - "epoch": 0.04564315352697095, - "grad_norm": 0.49199178814888, - "learning_rate": 1.9987343171148904e-05, - "loss": 1.7783, - "step": 33 - }, - { - "epoch": 0.04702627939142462, - "grad_norm": 0.5891382694244385, - "learning_rate": 1.9986218909736758e-05, - "loss": 1.7706, - "step": 34 - }, - { - "epoch": 0.048409405255878286, - "grad_norm": 0.5374609231948853, - "learning_rate": 1.9985046852300183e-05, - "loss": 1.7426, - "step": 35 - }, - { - "epoch": 0.04979253112033195, - "grad_norm": 0.5391809344291687, - "learning_rate": 1.9983827004448875e-05, - "loss": 1.7105, - "step": 36 - }, - { - "epoch": 0.051175656984785614, - "grad_norm": 0.5461614727973938, - "learning_rate": 1.9982559372021274e-05, - "loss": 1.694, - "step": 37 - }, - { - "epoch": 0.05255878284923928, - "grad_norm": 0.531283974647522, - "learning_rate": 1.9981243961084516e-05, - "loss": 1.7295, - "step": 38 - }, - { - "epoch": 0.05394190871369295, - "grad_norm": 0.512266218662262, - "learning_rate": 1.997988077793442e-05, - "loss": 1.6836, - "step": 39 - }, - { - "epoch": 0.05532503457814661, - "grad_norm": 0.5940636396408081, - "learning_rate": 1.997846982909545e-05, - "loss": 1.7573, - "step": 40 - }, - { - "epoch": 0.056708160442600276, - "grad_norm": 0.5649189352989197, - "learning_rate": 1.9977011121320687e-05, - "loss": 1.6611, - "step": 41 - }, - { - "epoch": 0.058091286307053944, - "grad_norm": 0.5476697087287903, - "learning_rate": 1.99755046615918e-05, - "loss": 1.6048, - "step": 42 - }, - { - "epoch": 0.059474412171507604, - "grad_norm": 0.5268926024436951, - "learning_rate": 1.9973950457119e-05, - "loss": 1.6819, - "step": 43 - }, - { - "epoch": 0.06085753803596127, - "grad_norm": 0.5020188689231873, - "learning_rate": 1.9972348515341018e-05, - "loss": 1.8042, - "step": 44 - }, - { - "epoch": 0.06224066390041494, - "grad_norm": 0.5357793569564819, - "learning_rate": 1.9970698843925064e-05, - "loss": 1.799, - "step": 45 - }, - { - "epoch": 0.0636237897648686, - "grad_norm": 0.5506536364555359, - "learning_rate": 1.9969001450766795e-05, - "loss": 1.6811, - "step": 46 - }, - { - "epoch": 0.06500691562932227, - "grad_norm": 0.5344098806381226, - "learning_rate": 1.9967256343990272e-05, - "loss": 1.6994, - "step": 47 - }, - { - "epoch": 0.06639004149377593, - "grad_norm": 0.6598169803619385, - "learning_rate": 1.996546353194792e-05, - "loss": 1.6529, - "step": 48 - }, - { - "epoch": 0.0677731673582296, - "grad_norm": 0.5558203458786011, - "learning_rate": 1.9963623023220493e-05, - "loss": 1.6161, - "step": 49 - }, - { - "epoch": 0.06915629322268327, - "grad_norm": 0.5793808102607727, - "learning_rate": 1.9961734826617033e-05, - "loss": 1.3158, - "step": 50 - }, - { - "epoch": 0.07053941908713693, - "grad_norm": 0.5689916610717773, - "learning_rate": 1.995979895117482e-05, - "loss": 1.5892, - "step": 51 - }, - { - "epoch": 0.07192254495159059, - "grad_norm": 0.5733948349952698, - "learning_rate": 1.9957815406159344e-05, - "loss": 1.6697, - "step": 52 - }, - { - "epoch": 0.07330567081604426, - "grad_norm": 0.5705085396766663, - "learning_rate": 1.995578420106424e-05, - "loss": 1.6955, - "step": 53 - }, - { - "epoch": 0.07468879668049792, - "grad_norm": 0.6449640393257141, - "learning_rate": 1.995370534561125e-05, - "loss": 1.6568, - "step": 54 - }, - { - "epoch": 0.07607192254495158, - "grad_norm": 0.7000141739845276, - "learning_rate": 1.995157884975021e-05, - "loss": 1.6543, - "step": 55 - }, - { - "epoch": 0.07745504840940526, - "grad_norm": 0.725715696811676, - "learning_rate": 1.994940472365893e-05, - "loss": 1.5375, - "step": 56 - }, - { - "epoch": 0.07883817427385892, - "grad_norm": 0.616287350654602, - "learning_rate": 1.994718297774322e-05, - "loss": 1.7027, - "step": 57 - }, - { - "epoch": 0.08022130013831259, - "grad_norm": 0.561693012714386, - "learning_rate": 1.9944913622636798e-05, - "loss": 1.7051, - "step": 58 - }, - { - "epoch": 0.08160442600276625, - "grad_norm": 0.6091241240501404, - "learning_rate": 1.994259666920124e-05, - "loss": 1.6528, - "step": 59 - }, - { - "epoch": 0.08298755186721991, - "grad_norm": 0.6698527932167053, - "learning_rate": 1.994023212852595e-05, - "loss": 1.6039, - "step": 60 - }, - { - "epoch": 0.08437067773167359, - "grad_norm": 0.6914900541305542, - "learning_rate": 1.9937820011928086e-05, - "loss": 1.4681, - "step": 61 - }, - { - "epoch": 0.08575380359612725, - "grad_norm": 0.6026729345321655, - "learning_rate": 1.993536033095252e-05, - "loss": 1.6564, - "step": 62 - }, - { - "epoch": 0.08713692946058091, - "grad_norm": 0.6261776089668274, - "learning_rate": 1.9932853097371765e-05, - "loss": 1.7278, - "step": 63 - }, - { - "epoch": 0.08852005532503458, - "grad_norm": 0.6159116625785828, - "learning_rate": 1.9930298323185945e-05, - "loss": 1.604, - "step": 64 - }, - { - "epoch": 0.08990318118948824, - "grad_norm": 0.633053719997406, - "learning_rate": 1.992769602062272e-05, - "loss": 1.7534, - "step": 65 - }, - { - "epoch": 0.0912863070539419, - "grad_norm": 0.6861281394958496, - "learning_rate": 1.9925046202137215e-05, - "loss": 1.7307, - "step": 66 - }, - { - "epoch": 0.09266943291839558, - "grad_norm": 0.6718122959136963, - "learning_rate": 1.9922348880411997e-05, - "loss": 1.6336, - "step": 67 - }, - { - "epoch": 0.09405255878284924, - "grad_norm": 0.6402599811553955, - "learning_rate": 1.9919604068356978e-05, - "loss": 1.6736, - "step": 68 - }, - { - "epoch": 0.0954356846473029, - "grad_norm": 0.6972026824951172, - "learning_rate": 1.9916811779109374e-05, - "loss": 1.6701, - "step": 69 - }, - { - "epoch": 0.09681881051175657, - "grad_norm": 0.6567057967185974, - "learning_rate": 1.991397202603363e-05, - "loss": 1.716, - "step": 70 - }, - { - "epoch": 0.09820193637621023, - "grad_norm": 0.705405592918396, - "learning_rate": 1.991108482272138e-05, - "loss": 1.6328, - "step": 71 - }, - { - "epoch": 0.0995850622406639, - "grad_norm": 0.6641591787338257, - "learning_rate": 1.9908150182991338e-05, - "loss": 1.71, - "step": 72 - }, - { - "epoch": 0.10096818810511757, - "grad_norm": 0.7807027101516724, - "learning_rate": 1.990516812088928e-05, - "loss": 1.6361, - "step": 73 - }, - { - "epoch": 0.10235131396957123, - "grad_norm": 0.7211569547653198, - "learning_rate": 1.9902138650687943e-05, - "loss": 1.5969, - "step": 74 - }, - { - "epoch": 0.1037344398340249, - "grad_norm": 0.6972200870513916, - "learning_rate": 1.9899061786886978e-05, - "loss": 1.6331, - "step": 75 - }, - { - "epoch": 0.10511756569847856, - "grad_norm": 0.722425103187561, - "learning_rate": 1.9895937544212856e-05, - "loss": 1.6524, - "step": 76 - }, - { - "epoch": 0.10650069156293222, - "grad_norm": 0.8138152360916138, - "learning_rate": 1.9892765937618826e-05, - "loss": 1.6209, - "step": 77 - }, - { - "epoch": 0.1078838174273859, - "grad_norm": 0.6803196668624878, - "learning_rate": 1.9889546982284833e-05, - "loss": 1.6756, - "step": 78 - }, - { - "epoch": 0.10926694329183956, - "grad_norm": 0.7364890575408936, - "learning_rate": 1.988628069361743e-05, - "loss": 1.5753, - "step": 79 - }, - { - "epoch": 0.11065006915629322, - "grad_norm": 0.6657965183258057, - "learning_rate": 1.988296708724972e-05, - "loss": 1.62, - "step": 80 - }, - { - "epoch": 0.11203319502074689, - "grad_norm": 0.709164559841156, - "learning_rate": 1.9879606179041283e-05, - "loss": 1.7221, - "step": 81 - }, - { - "epoch": 0.11341632088520055, - "grad_norm": 0.6902270317077637, - "learning_rate": 1.987619798507809e-05, - "loss": 1.7295, - "step": 82 - }, - { - "epoch": 0.11479944674965421, - "grad_norm": 0.6978942155838013, - "learning_rate": 1.987274252167244e-05, - "loss": 1.7619, - "step": 83 - }, - { - "epoch": 0.11618257261410789, - "grad_norm": 0.6876259446144104, - "learning_rate": 1.986923980536286e-05, - "loss": 1.6597, - "step": 84 - }, - { - "epoch": 0.11756569847856155, - "grad_norm": 0.7877823114395142, - "learning_rate": 1.9865689852914048e-05, - "loss": 1.5334, - "step": 85 - }, - { - "epoch": 0.11894882434301521, - "grad_norm": 0.748802125453949, - "learning_rate": 1.9862092681316774e-05, - "loss": 1.6225, - "step": 86 - }, - { - "epoch": 0.12033195020746888, - "grad_norm": 0.8329505920410156, - "learning_rate": 1.9858448307787827e-05, - "loss": 1.5343, - "step": 87 - }, - { - "epoch": 0.12171507607192254, - "grad_norm": 0.8424030542373657, - "learning_rate": 1.9854756749769893e-05, - "loss": 1.6712, - "step": 88 - }, - { - "epoch": 0.12309820193637622, - "grad_norm": 0.7446318864822388, - "learning_rate": 1.98510180249315e-05, - "loss": 1.7452, - "step": 89 - }, - { - "epoch": 0.12448132780082988, - "grad_norm": 0.7498570680618286, - "learning_rate": 1.984723215116693e-05, - "loss": 1.5573, - "step": 90 - }, - { - "epoch": 0.12586445366528354, - "grad_norm": 0.7254923582077026, - "learning_rate": 1.9843399146596125e-05, - "loss": 1.6581, - "step": 91 - }, - { - "epoch": 0.1272475795297372, - "grad_norm": 0.8855713605880737, - "learning_rate": 1.9839519029564608e-05, - "loss": 1.5751, - "step": 92 - }, - { - "epoch": 0.12863070539419086, - "grad_norm": 0.7330020070075989, - "learning_rate": 1.983559181864338e-05, - "loss": 1.5733, - "step": 93 - }, - { - "epoch": 0.13001383125864455, - "grad_norm": 0.7649924755096436, - "learning_rate": 1.983161753262886e-05, - "loss": 1.7353, - "step": 94 - }, - { - "epoch": 0.1313969571230982, - "grad_norm": 0.7761242985725403, - "learning_rate": 1.982759619054277e-05, - "loss": 1.6374, - "step": 95 - }, - { - "epoch": 0.13278008298755187, - "grad_norm": 0.8000247478485107, - "learning_rate": 1.982352781163204e-05, - "loss": 1.6466, - "step": 96 - }, - { - "epoch": 0.13416320885200553, - "grad_norm": 0.7133256196975708, - "learning_rate": 1.9819412415368753e-05, - "loss": 1.5993, - "step": 97 - }, - { - "epoch": 0.1355463347164592, - "grad_norm": 0.781935453414917, - "learning_rate": 1.9815250021449998e-05, - "loss": 1.5487, - "step": 98 - }, - { - "epoch": 0.13692946058091288, - "grad_norm": 0.7965666055679321, - "learning_rate": 1.981104064979783e-05, - "loss": 1.5498, - "step": 99 - }, - { - "epoch": 0.13831258644536654, - "grad_norm": 0.8179916143417358, - "learning_rate": 1.980678432055913e-05, - "loss": 1.6324, - "step": 100 - }, - { - "epoch": 0.1396957123098202, - "grad_norm": 0.8549492955207825, - "learning_rate": 1.9802481054105527e-05, - "loss": 1.6359, - "step": 101 - }, - { - "epoch": 0.14107883817427386, - "grad_norm": 0.8777151107788086, - "learning_rate": 1.9798130871033322e-05, - "loss": 1.5975, - "step": 102 - }, - { - "epoch": 0.14246196403872752, - "grad_norm": 1.004165530204773, - "learning_rate": 1.979373379216335e-05, - "loss": 1.623, - "step": 103 - }, - { - "epoch": 0.14384508990318118, - "grad_norm": 0.9654561877250671, - "learning_rate": 1.9789289838540897e-05, - "loss": 1.5919, - "step": 104 - }, - { - "epoch": 0.14522821576763487, - "grad_norm": 0.8471197485923767, - "learning_rate": 1.978479903143561e-05, - "loss": 1.6244, - "step": 105 - }, - { - "epoch": 0.14661134163208853, - "grad_norm": 0.813434362411499, - "learning_rate": 1.9780261392341383e-05, - "loss": 1.6015, - "step": 106 - }, - { - "epoch": 0.1479944674965422, - "grad_norm": 0.9421816468238831, - "learning_rate": 1.9775676942976253e-05, - "loss": 1.6022, - "step": 107 - }, - { - "epoch": 0.14937759336099585, - "grad_norm": 0.7814284563064575, - "learning_rate": 1.9771045705282313e-05, - "loss": 1.6354, - "step": 108 - }, - { - "epoch": 0.1507607192254495, - "grad_norm": 0.9225528240203857, - "learning_rate": 1.9766367701425575e-05, - "loss": 1.6794, - "step": 109 - }, - { - "epoch": 0.15214384508990317, - "grad_norm": 0.9190025329589844, - "learning_rate": 1.9761642953795896e-05, - "loss": 1.511, - "step": 110 - }, - { - "epoch": 0.15352697095435686, - "grad_norm": 0.857140302658081, - "learning_rate": 1.9756871485006862e-05, - "loss": 1.5262, - "step": 111 - }, - { - "epoch": 0.15491009681881052, - "grad_norm": 0.7790897488594055, - "learning_rate": 1.975205331789566e-05, - "loss": 1.2074, - "step": 112 - }, - { - "epoch": 0.15629322268326418, - "grad_norm": 0.870851993560791, - "learning_rate": 1.9747188475523e-05, - "loss": 1.6356, - "step": 113 - }, - { - "epoch": 0.15767634854771784, - "grad_norm": 0.9374619722366333, - "learning_rate": 1.9742276981172978e-05, - "loss": 1.5746, - "step": 114 - }, - { - "epoch": 0.1590594744121715, - "grad_norm": 0.8879979252815247, - "learning_rate": 1.973731885835298e-05, - "loss": 1.5728, - "step": 115 - }, - { - "epoch": 0.16044260027662519, - "grad_norm": 0.8657084107398987, - "learning_rate": 1.973231413079357e-05, - "loss": 1.6845, - "step": 116 - }, - { - "epoch": 0.16182572614107885, - "grad_norm": 1.0091602802276611, - "learning_rate": 1.9727262822448364e-05, - "loss": 1.5589, - "step": 117 - }, - { - "epoch": 0.1632088520055325, - "grad_norm": 0.8246741890907288, - "learning_rate": 1.9722164957493925e-05, - "loss": 1.6564, - "step": 118 - }, - { - "epoch": 0.16459197786998617, - "grad_norm": 0.8533387780189514, - "learning_rate": 1.9717020560329644e-05, - "loss": 1.5895, - "step": 119 - }, - { - "epoch": 0.16597510373443983, - "grad_norm": 0.8713412284851074, - "learning_rate": 1.971182965557763e-05, - "loss": 1.6223, - "step": 120 - }, - { - "epoch": 0.1673582295988935, - "grad_norm": 0.948832631111145, - "learning_rate": 1.9706592268082584e-05, - "loss": 1.5907, - "step": 121 - }, - { - "epoch": 0.16874135546334718, - "grad_norm": 0.9240509867668152, - "learning_rate": 1.9701308422911674e-05, - "loss": 1.6396, - "step": 122 - }, - { - "epoch": 0.17012448132780084, - "grad_norm": 0.8856998085975647, - "learning_rate": 1.969597814535444e-05, - "loss": 1.5642, - "step": 123 - }, - { - "epoch": 0.1715076071922545, - "grad_norm": 1.0024968385696411, - "learning_rate": 1.969060146092264e-05, - "loss": 1.5414, - "step": 124 - }, - { - "epoch": 0.17289073305670816, - "grad_norm": 1.0026429891586304, - "learning_rate": 1.9685178395350157e-05, - "loss": 1.4907, - "step": 125 - }, - { - "epoch": 0.17427385892116182, - "grad_norm": 1.0451340675354004, - "learning_rate": 1.967970897459286e-05, - "loss": 1.6097, - "step": 126 - }, - { - "epoch": 0.17565698478561548, - "grad_norm": 0.9511062502861023, - "learning_rate": 1.9674193224828477e-05, - "loss": 1.5482, - "step": 127 - }, - { - "epoch": 0.17704011065006917, - "grad_norm": 0.9325770139694214, - "learning_rate": 1.966863117245648e-05, - "loss": 1.6144, - "step": 128 - }, - { - "epoch": 0.17842323651452283, - "grad_norm": 0.9411192536354065, - "learning_rate": 1.9663022844097956e-05, - "loss": 1.6372, - "step": 129 - }, - { - "epoch": 0.1798063623789765, - "grad_norm": 0.9200664758682251, - "learning_rate": 1.9657368266595477e-05, - "loss": 1.6128, - "step": 130 - }, - { - "epoch": 0.18118948824343015, - "grad_norm": 1.013554334640503, - "learning_rate": 1.9651667467012977e-05, - "loss": 1.4844, - "step": 131 - }, - { - "epoch": 0.1825726141078838, - "grad_norm": 0.9959779381752014, - "learning_rate": 1.964592047263561e-05, - "loss": 1.5979, - "step": 132 - }, - { - "epoch": 0.1839557399723375, - "grad_norm": 0.8918667435646057, - "learning_rate": 1.9640127310969626e-05, - "loss": 1.6407, - "step": 133 - }, - { - "epoch": 0.18533886583679116, - "grad_norm": 0.9420216083526611, - "learning_rate": 1.9634288009742254e-05, - "loss": 1.6495, - "step": 134 - }, - { - "epoch": 0.18672199170124482, - "grad_norm": 1.0406219959259033, - "learning_rate": 1.9628402596901545e-05, - "loss": 1.6287, - "step": 135 - }, - { - "epoch": 0.18810511756569848, - "grad_norm": 0.9881539940834045, - "learning_rate": 1.9622471100616253e-05, - "loss": 1.6145, - "step": 136 - }, - { - "epoch": 0.18948824343015214, - "grad_norm": 0.9983912706375122, - "learning_rate": 1.961649354927569e-05, - "loss": 1.5708, - "step": 137 - }, - { - "epoch": 0.1908713692946058, - "grad_norm": 0.9389325380325317, - "learning_rate": 1.961046997148961e-05, - "loss": 1.654, - "step": 138 - }, - { - "epoch": 0.19225449515905949, - "grad_norm": 0.9617229104042053, - "learning_rate": 1.9604400396088047e-05, - "loss": 1.5198, - "step": 139 - }, - { - "epoch": 0.19363762102351315, - "grad_norm": 0.9675804972648621, - "learning_rate": 1.959828485212119e-05, - "loss": 1.635, - "step": 140 - }, - { - "epoch": 0.1950207468879668, - "grad_norm": 1.0724852085113525, - "learning_rate": 1.959212336885925e-05, - "loss": 1.4736, - "step": 141 - }, - { - "epoch": 0.19640387275242047, - "grad_norm": 1.0052822828292847, - "learning_rate": 1.958591597579231e-05, - "loss": 1.5661, - "step": 142 - }, - { - "epoch": 0.19778699861687413, - "grad_norm": 0.890238881111145, - "learning_rate": 1.957966270263018e-05, - "loss": 1.252, - "step": 143 - }, - { - "epoch": 0.1991701244813278, - "grad_norm": 0.9959819912910461, - "learning_rate": 1.957336357930227e-05, - "loss": 1.4612, - "step": 144 - }, - { - "epoch": 0.20055325034578148, - "grad_norm": 1.03453528881073, - "learning_rate": 1.9567018635957426e-05, - "loss": 1.4836, - "step": 145 - }, - { - "epoch": 0.20193637621023514, - "grad_norm": 1.0642746686935425, - "learning_rate": 1.9560627902963808e-05, - "loss": 1.5676, - "step": 146 - }, - { - "epoch": 0.2033195020746888, - "grad_norm": 1.0111668109893799, - "learning_rate": 1.955419141090874e-05, - "loss": 1.5925, - "step": 147 - }, - { - "epoch": 0.20470262793914246, - "grad_norm": 1.0494191646575928, - "learning_rate": 1.9547709190598538e-05, - "loss": 1.6079, - "step": 148 - }, - { - "epoch": 0.20608575380359612, - "grad_norm": 0.9530560970306396, - "learning_rate": 1.95411812730584e-05, - "loss": 1.5284, - "step": 149 - }, - { - "epoch": 0.2074688796680498, - "grad_norm": 1.0679057836532593, - "learning_rate": 1.9534607689532236e-05, - "loss": 1.515, - "step": 150 - }, - { - "epoch": 0.20885200553250347, - "grad_norm": 1.0620648860931396, - "learning_rate": 1.9527988471482517e-05, - "loss": 1.6281, - "step": 151 - }, - { - "epoch": 0.21023513139695713, - "grad_norm": 0.9668426513671875, - "learning_rate": 1.9521323650590135e-05, - "loss": 1.5903, - "step": 152 - }, - { - "epoch": 0.21161825726141079, - "grad_norm": 1.0088167190551758, - "learning_rate": 1.9514613258754244e-05, - "loss": 1.5094, - "step": 153 - }, - { - "epoch": 0.21300138312586445, - "grad_norm": 1.113911747932434, - "learning_rate": 1.950785732809211e-05, - "loss": 1.5212, - "step": 154 - }, - { - "epoch": 0.2143845089903181, - "grad_norm": 0.95964515209198, - "learning_rate": 1.9501055890938957e-05, - "loss": 1.5151, - "step": 155 - }, - { - "epoch": 0.2157676348547718, - "grad_norm": 0.978797972202301, - "learning_rate": 1.9494208979847814e-05, - "loss": 1.5552, - "step": 156 - }, - { - "epoch": 0.21715076071922546, - "grad_norm": 1.2046525478363037, - "learning_rate": 1.9487316627589353e-05, - "loss": 1.6375, - "step": 157 - }, - { - "epoch": 0.21853388658367912, - "grad_norm": 1.1309069395065308, - "learning_rate": 1.9480378867151746e-05, - "loss": 1.5061, - "step": 158 - }, - { - "epoch": 0.21991701244813278, - "grad_norm": 1.0797772407531738, - "learning_rate": 1.9473395731740483e-05, - "loss": 1.4682, - "step": 159 - }, - { - "epoch": 0.22130013831258644, - "grad_norm": 1.1034504175186157, - "learning_rate": 1.9466367254778234e-05, - "loss": 1.5552, - "step": 160 - }, - { - "epoch": 0.22268326417704012, - "grad_norm": 1.03969407081604, - "learning_rate": 1.945929346990469e-05, - "loss": 1.6028, - "step": 161 - }, - { - "epoch": 0.22406639004149378, - "grad_norm": 1.2426897287368774, - "learning_rate": 1.9452174410976383e-05, - "loss": 1.5764, - "step": 162 - }, - { - "epoch": 0.22544951590594745, - "grad_norm": 1.0647516250610352, - "learning_rate": 1.9445010112066543e-05, - "loss": 1.5122, - "step": 163 - }, - { - "epoch": 0.2268326417704011, - "grad_norm": 0.976096510887146, - "learning_rate": 1.943780060746493e-05, - "loss": 1.6876, - "step": 164 - }, - { - "epoch": 0.22821576763485477, - "grad_norm": 1.1224089860916138, - "learning_rate": 1.9430545931677657e-05, - "loss": 1.5798, - "step": 165 - }, - { - "epoch": 0.22959889349930843, - "grad_norm": 1.1245198249816895, - "learning_rate": 1.9423246119427044e-05, - "loss": 1.5093, - "step": 166 - }, - { - "epoch": 0.23098201936376211, - "grad_norm": 1.0290110111236572, - "learning_rate": 1.941590120565144e-05, - "loss": 1.5308, - "step": 167 - }, - { - "epoch": 0.23236514522821577, - "grad_norm": 1.0023036003112793, - "learning_rate": 1.940851122550506e-05, - "loss": 1.6366, - "step": 168 - }, - { - "epoch": 0.23374827109266944, - "grad_norm": 1.287907361984253, - "learning_rate": 1.940107621435781e-05, - "loss": 1.5091, - "step": 169 - }, - { - "epoch": 0.2351313969571231, - "grad_norm": 1.159561276435852, - "learning_rate": 1.9393596207795135e-05, - "loss": 1.3876, - "step": 170 - }, - { - "epoch": 0.23651452282157676, - "grad_norm": 1.150119662284851, - "learning_rate": 1.9386071241617827e-05, - "loss": 1.5695, - "step": 171 - }, - { - "epoch": 0.23789764868603042, - "grad_norm": 1.1534556150436401, - "learning_rate": 1.9378501351841864e-05, - "loss": 1.4763, - "step": 172 - }, - { - "epoch": 0.2392807745504841, - "grad_norm": 1.2043812274932861, - "learning_rate": 1.9370886574698244e-05, - "loss": 1.5643, - "step": 173 - }, - { - "epoch": 0.24066390041493776, - "grad_norm": 1.049975872039795, - "learning_rate": 1.93632269466328e-05, - "loss": 1.6063, - "step": 174 - }, - { - "epoch": 0.24204702627939143, - "grad_norm": 1.168745517730713, - "learning_rate": 1.9355522504306027e-05, - "loss": 1.5692, - "step": 175 - }, - { - "epoch": 0.24343015214384509, - "grad_norm": 1.119295597076416, - "learning_rate": 1.934777328459292e-05, - "loss": 1.4857, - "step": 176 - }, - { - "epoch": 0.24481327800829875, - "grad_norm": 1.1553229093551636, - "learning_rate": 1.9339979324582782e-05, - "loss": 1.4596, - "step": 177 - }, - { - "epoch": 0.24619640387275243, - "grad_norm": 1.2437880039215088, - "learning_rate": 1.933214066157904e-05, - "loss": 1.601, - "step": 178 - }, - { - "epoch": 0.2475795297372061, - "grad_norm": 1.1771876811981201, - "learning_rate": 1.9324257333099104e-05, - "loss": 1.5632, - "step": 179 - }, - { - "epoch": 0.24896265560165975, - "grad_norm": 1.140103816986084, - "learning_rate": 1.9316329376874146e-05, - "loss": 1.5909, - "step": 180 - }, - { - "epoch": 0.2503457814661134, - "grad_norm": 1.1156872510910034, - "learning_rate": 1.9308356830848925e-05, - "loss": 1.6356, - "step": 181 - }, - { - "epoch": 0.2503457814661134, - "eval_loss": 1.163191318511963, - "eval_runtime": 108.8957, - "eval_samples_per_second": 13.26, - "eval_steps_per_second": 0.836, - "step": 181 - }, - { - "epoch": 0.2517289073305671, - "grad_norm": 1.1111009120941162, - "learning_rate": 1.930033973318164e-05, - "loss": 1.4833, - "step": 182 - }, - { - "epoch": 0.25311203319502074, - "grad_norm": 1.1635496616363525, - "learning_rate": 1.9292278122243705e-05, - "loss": 1.5605, - "step": 183 - }, - { - "epoch": 0.2544951590594744, - "grad_norm": 1.1545699834823608, - "learning_rate": 1.9284172036619597e-05, - "loss": 1.6157, - "step": 184 - }, - { - "epoch": 0.25587828492392806, - "grad_norm": 1.1379010677337646, - "learning_rate": 1.9276021515106635e-05, - "loss": 1.4126, - "step": 185 - }, - { - "epoch": 0.2572614107883817, - "grad_norm": 1.153485894203186, - "learning_rate": 1.926782659671484e-05, - "loss": 1.5618, - "step": 186 - }, - { - "epoch": 0.25864453665283543, - "grad_norm": 1.1652241945266724, - "learning_rate": 1.925958732066672e-05, - "loss": 1.6186, - "step": 187 - }, - { - "epoch": 0.2600276625172891, - "grad_norm": 1.1322290897369385, - "learning_rate": 1.9251303726397076e-05, - "loss": 1.5684, - "step": 188 - }, - { - "epoch": 0.26141078838174275, - "grad_norm": 1.0313161611557007, - "learning_rate": 1.924297585355284e-05, - "loss": 1.5584, - "step": 189 - }, - { - "epoch": 0.2627939142461964, - "grad_norm": 1.2757902145385742, - "learning_rate": 1.9234603741992864e-05, - "loss": 1.5329, - "step": 190 - }, - { - "epoch": 0.2641770401106501, - "grad_norm": 1.0278979539871216, - "learning_rate": 1.9226187431787727e-05, - "loss": 1.5397, - "step": 191 - }, - { - "epoch": 0.26556016597510373, - "grad_norm": 1.227102518081665, - "learning_rate": 1.9217726963219567e-05, - "loss": 1.4856, - "step": 192 - }, - { - "epoch": 0.2669432918395574, - "grad_norm": 1.2148761749267578, - "learning_rate": 1.9209222376781864e-05, - "loss": 1.511, - "step": 193 - }, - { - "epoch": 0.26832641770401106, - "grad_norm": 1.1404523849487305, - "learning_rate": 1.9200673713179245e-05, - "loss": 1.6732, - "step": 194 - }, - { - "epoch": 0.2697095435684647, - "grad_norm": 1.3106427192687988, - "learning_rate": 1.9192081013327325e-05, - "loss": 1.4351, - "step": 195 - }, - { - "epoch": 0.2710926694329184, - "grad_norm": 1.1526203155517578, - "learning_rate": 1.9183444318352458e-05, - "loss": 1.5936, - "step": 196 - }, - { - "epoch": 0.27247579529737204, - "grad_norm": 1.1274484395980835, - "learning_rate": 1.9174763669591583e-05, - "loss": 1.5429, - "step": 197 - }, - { - "epoch": 0.27385892116182575, - "grad_norm": 1.1309480667114258, - "learning_rate": 1.9166039108592008e-05, - "loss": 1.5481, - "step": 198 - }, - { - "epoch": 0.2752420470262794, - "grad_norm": 1.1612128019332886, - "learning_rate": 1.9157270677111214e-05, - "loss": 1.4978, - "step": 199 - }, - { - "epoch": 0.2766251728907331, - "grad_norm": 1.1628715991973877, - "learning_rate": 1.9148458417116645e-05, - "loss": 1.5816, - "step": 200 - }, - { - "epoch": 0.27800829875518673, - "grad_norm": 1.3227707147598267, - "learning_rate": 1.9139602370785536e-05, - "loss": 1.4715, - "step": 201 - }, - { - "epoch": 0.2793914246196404, - "grad_norm": 1.2166869640350342, - "learning_rate": 1.9130702580504678e-05, - "loss": 1.6047, - "step": 202 - }, - { - "epoch": 0.28077455048409405, - "grad_norm": 1.0572668313980103, - "learning_rate": 1.9121759088870228e-05, - "loss": 1.6119, - "step": 203 - }, - { - "epoch": 0.2821576763485477, - "grad_norm": 1.2285197973251343, - "learning_rate": 1.911277193868751e-05, - "loss": 1.558, - "step": 204 - }, - { - "epoch": 0.2835408022130014, - "grad_norm": 1.1655161380767822, - "learning_rate": 1.9103741172970816e-05, - "loss": 1.5855, - "step": 205 - }, - { - "epoch": 0.28492392807745504, - "grad_norm": 1.232515573501587, - "learning_rate": 1.9094666834943177e-05, - "loss": 1.5637, - "step": 206 - }, - { - "epoch": 0.2863070539419087, - "grad_norm": 1.2622716426849365, - "learning_rate": 1.9085548968036174e-05, - "loss": 1.4948, - "step": 207 - }, - { - "epoch": 0.28769017980636236, - "grad_norm": 1.2562229633331299, - "learning_rate": 1.9076387615889728e-05, - "loss": 1.5645, - "step": 208 - }, - { - "epoch": 0.28907330567081607, - "grad_norm": 1.1468416452407837, - "learning_rate": 1.9067182822351884e-05, - "loss": 1.6337, - "step": 209 - }, - { - "epoch": 0.29045643153526973, - "grad_norm": 1.250429391860962, - "learning_rate": 1.9057934631478616e-05, - "loss": 1.5544, - "step": 210 - }, - { - "epoch": 0.2918395573997234, - "grad_norm": 1.1409355401992798, - "learning_rate": 1.9048643087533594e-05, - "loss": 1.5368, - "step": 211 - }, - { - "epoch": 0.29322268326417705, - "grad_norm": 1.1958142518997192, - "learning_rate": 1.903930823498799e-05, - "loss": 1.5344, - "step": 212 - }, - { - "epoch": 0.2946058091286307, - "grad_norm": 1.1556063890457153, - "learning_rate": 1.9029930118520266e-05, - "loss": 1.591, - "step": 213 - }, - { - "epoch": 0.2959889349930844, - "grad_norm": 1.1471951007843018, - "learning_rate": 1.9020508783015942e-05, - "loss": 1.5892, - "step": 214 - }, - { - "epoch": 0.29737206085753803, - "grad_norm": 1.114162802696228, - "learning_rate": 1.9011044273567405e-05, - "loss": 1.4876, - "step": 215 - }, - { - "epoch": 0.2987551867219917, - "grad_norm": 1.2788852453231812, - "learning_rate": 1.9001536635473664e-05, - "loss": 1.5746, - "step": 216 - }, - { - "epoch": 0.30013831258644535, - "grad_norm": 1.2140027284622192, - "learning_rate": 1.8991985914240166e-05, - "loss": 1.5842, - "step": 217 - }, - { - "epoch": 0.301521438450899, - "grad_norm": 1.3167227506637573, - "learning_rate": 1.898239215557856e-05, - "loss": 1.5455, - "step": 218 - }, - { - "epoch": 0.3029045643153527, - "grad_norm": 1.22543466091156, - "learning_rate": 1.8972755405406475e-05, - "loss": 1.5469, - "step": 219 - }, - { - "epoch": 0.30428769017980634, - "grad_norm": 1.2558698654174805, - "learning_rate": 1.8963075709847308e-05, - "loss": 1.4874, - "step": 220 - }, - { - "epoch": 0.30567081604426005, - "grad_norm": 1.1993651390075684, - "learning_rate": 1.8953353115230005e-05, - "loss": 1.5977, - "step": 221 - }, - { - "epoch": 0.3070539419087137, - "grad_norm": 1.422924280166626, - "learning_rate": 1.894358766808883e-05, - "loss": 1.371, - "step": 222 - }, - { - "epoch": 0.3084370677731674, - "grad_norm": 1.2783629894256592, - "learning_rate": 1.893377941516315e-05, - "loss": 1.536, - "step": 223 - }, - { - "epoch": 0.30982019363762103, - "grad_norm": 1.2318593263626099, - "learning_rate": 1.892392840339721e-05, - "loss": 1.6051, - "step": 224 - }, - { - "epoch": 0.3112033195020747, - "grad_norm": 1.2269023656845093, - "learning_rate": 1.8914034679939905e-05, - "loss": 1.4488, - "step": 225 - }, - { - "epoch": 0.31258644536652835, - "grad_norm": 1.2582275867462158, - "learning_rate": 1.8904098292144556e-05, - "loss": 1.4979, - "step": 226 - }, - { - "epoch": 0.313969571230982, - "grad_norm": 1.4117553234100342, - "learning_rate": 1.889411928756869e-05, - "loss": 1.5858, - "step": 227 - }, - { - "epoch": 0.3153526970954357, - "grad_norm": 1.472165584564209, - "learning_rate": 1.8884097713973798e-05, - "loss": 1.4512, - "step": 228 - }, - { - "epoch": 0.31673582295988933, - "grad_norm": 1.240386962890625, - "learning_rate": 1.8874033619325124e-05, - "loss": 1.5531, - "step": 229 - }, - { - "epoch": 0.318118948824343, - "grad_norm": 1.289332389831543, - "learning_rate": 1.8863927051791418e-05, - "loss": 1.4774, - "step": 230 - }, - { - "epoch": 0.31950207468879666, - "grad_norm": 1.5051286220550537, - "learning_rate": 1.8853778059744716e-05, - "loss": 1.2835, - "step": 231 - }, - { - "epoch": 0.32088520055325037, - "grad_norm": 1.4394385814666748, - "learning_rate": 1.884358669176011e-05, - "loss": 1.4348, - "step": 232 - }, - { - "epoch": 0.32226832641770403, - "grad_norm": 1.3271335363388062, - "learning_rate": 1.8833352996615507e-05, - "loss": 1.4749, - "step": 233 - }, - { - "epoch": 0.3236514522821577, - "grad_norm": 1.3286136388778687, - "learning_rate": 1.88230770232914e-05, - "loss": 1.5111, - "step": 234 - }, - { - "epoch": 0.32503457814661135, - "grad_norm": 1.2873828411102295, - "learning_rate": 1.8812758820970637e-05, - "loss": 1.4664, - "step": 235 - }, - { - "epoch": 0.326417704011065, - "grad_norm": 1.3673721551895142, - "learning_rate": 1.8802398439038175e-05, - "loss": 1.3966, - "step": 236 - }, - { - "epoch": 0.3278008298755187, - "grad_norm": 1.306979775428772, - "learning_rate": 1.879199592708087e-05, - "loss": 1.5416, - "step": 237 - }, - { - "epoch": 0.32918395573997233, - "grad_norm": 1.191098690032959, - "learning_rate": 1.8781551334887204e-05, - "loss": 1.4901, - "step": 238 - }, - { - "epoch": 0.330567081604426, - "grad_norm": 1.3898661136627197, - "learning_rate": 1.8771064712447054e-05, - "loss": 1.6548, - "step": 239 - }, - { - "epoch": 0.33195020746887965, - "grad_norm": 1.3041338920593262, - "learning_rate": 1.876053610995149e-05, - "loss": 1.4909, - "step": 240 - }, - { - "epoch": 0.3333333333333333, - "grad_norm": 1.3957127332687378, - "learning_rate": 1.8749965577792482e-05, - "loss": 1.5244, - "step": 241 - }, - { - "epoch": 0.334716459197787, - "grad_norm": 1.3891314268112183, - "learning_rate": 1.87393531665627e-05, - "loss": 1.4816, - "step": 242 - }, - { - "epoch": 0.3360995850622407, - "grad_norm": 1.483026385307312, - "learning_rate": 1.872869892705525e-05, - "loss": 1.3794, - "step": 243 - }, - { - "epoch": 0.33748271092669435, - "grad_norm": 1.2564777135849, - "learning_rate": 1.8718002910263426e-05, - "loss": 1.5489, - "step": 244 - }, - { - "epoch": 0.338865836791148, - "grad_norm": 1.255433440208435, - "learning_rate": 1.8707265167380497e-05, - "loss": 1.4259, - "step": 245 - }, - { - "epoch": 0.34024896265560167, - "grad_norm": 1.4052809476852417, - "learning_rate": 1.869648574979942e-05, - "loss": 1.6745, - "step": 246 - }, - { - "epoch": 0.34163208852005533, - "grad_norm": 1.368813157081604, - "learning_rate": 1.8685664709112637e-05, - "loss": 1.5183, - "step": 247 - }, - { - "epoch": 0.343015214384509, - "grad_norm": 1.3451085090637207, - "learning_rate": 1.8674802097111784e-05, - "loss": 1.5095, - "step": 248 - }, - { - "epoch": 0.34439834024896265, - "grad_norm": 1.249712586402893, - "learning_rate": 1.8663897965787483e-05, - "loss": 1.5347, - "step": 249 - }, - { - "epoch": 0.3457814661134163, - "grad_norm": 1.2913545370101929, - "learning_rate": 1.865295236732907e-05, - "loss": 1.5025, - "step": 250 - }, - { - "epoch": 0.34716459197787, - "grad_norm": 1.3476046323776245, - "learning_rate": 1.8641965354124346e-05, - "loss": 1.5246, - "step": 251 - }, - { - "epoch": 0.34854771784232363, - "grad_norm": 1.253641963005066, - "learning_rate": 1.8630936978759337e-05, - "loss": 1.4977, - "step": 252 - }, - { - "epoch": 0.3499308437067773, - "grad_norm": 1.3167630434036255, - "learning_rate": 1.8619867294018035e-05, - "loss": 1.451, - "step": 253 - }, - { - "epoch": 0.35131396957123096, - "grad_norm": 1.3077702522277832, - "learning_rate": 1.8608756352882152e-05, - "loss": 1.5264, - "step": 254 - }, - { - "epoch": 0.35269709543568467, - "grad_norm": 1.2955269813537598, - "learning_rate": 1.8597604208530845e-05, - "loss": 1.4943, - "step": 255 - }, - { - "epoch": 0.35408022130013833, - "grad_norm": 1.418426513671875, - "learning_rate": 1.85864109143405e-05, - "loss": 1.5298, - "step": 256 - }, - { - "epoch": 0.355463347164592, - "grad_norm": 1.3700319528579712, - "learning_rate": 1.8575176523884432e-05, - "loss": 1.4887, - "step": 257 - }, - { - "epoch": 0.35684647302904565, - "grad_norm": 1.3327845335006714, - "learning_rate": 1.8563901090932673e-05, - "loss": 1.5672, - "step": 258 - }, - { - "epoch": 0.3582295988934993, - "grad_norm": 1.4686397314071655, - "learning_rate": 1.8552584669451675e-05, - "loss": 1.3965, - "step": 259 - }, - { - "epoch": 0.359612724757953, - "grad_norm": 1.267582654953003, - "learning_rate": 1.854122731360408e-05, - "loss": 1.5229, - "step": 260 - }, - { - "epoch": 0.36099585062240663, - "grad_norm": 1.347273826599121, - "learning_rate": 1.8529829077748442e-05, - "loss": 1.4324, - "step": 261 - }, - { - "epoch": 0.3623789764868603, - "grad_norm": 1.4890333414077759, - "learning_rate": 1.851839001643898e-05, - "loss": 1.4877, - "step": 262 - }, - { - "epoch": 0.36376210235131395, - "grad_norm": 1.374370813369751, - "learning_rate": 1.850691018442531e-05, - "loss": 1.5164, - "step": 263 - }, - { - "epoch": 0.3651452282157676, - "grad_norm": 1.3033865690231323, - "learning_rate": 1.8495389636652185e-05, - "loss": 1.4799, - "step": 264 - }, - { - "epoch": 0.3665283540802213, - "grad_norm": 1.321038007736206, - "learning_rate": 1.8483828428259235e-05, - "loss": 1.4816, - "step": 265 - }, - { - "epoch": 0.367911479944675, - "grad_norm": 1.4516509771347046, - "learning_rate": 1.847222661458069e-05, - "loss": 1.3904, - "step": 266 - }, - { - "epoch": 0.36929460580912865, - "grad_norm": 1.3580619096755981, - "learning_rate": 1.8460584251145137e-05, - "loss": 1.501, - "step": 267 - }, - { - "epoch": 0.3706777316735823, - "grad_norm": 1.3735978603363037, - "learning_rate": 1.8448901393675233e-05, - "loss": 1.5034, - "step": 268 - }, - { - "epoch": 0.37206085753803597, - "grad_norm": 1.522796630859375, - "learning_rate": 1.8437178098087452e-05, - "loss": 1.5287, - "step": 269 - }, - { - "epoch": 0.37344398340248963, - "grad_norm": 1.4429105520248413, - "learning_rate": 1.8425414420491817e-05, - "loss": 1.4459, - "step": 270 - }, - { - "epoch": 0.3748271092669433, - "grad_norm": 1.3284740447998047, - "learning_rate": 1.841361041719161e-05, - "loss": 1.4334, - "step": 271 - }, - { - "epoch": 0.37621023513139695, - "grad_norm": 1.262669563293457, - "learning_rate": 1.8401766144683145e-05, - "loss": 1.5067, - "step": 272 - }, - { - "epoch": 0.3775933609958506, - "grad_norm": 1.5458612442016602, - "learning_rate": 1.8389881659655456e-05, - "loss": 1.5672, - "step": 273 - }, - { - "epoch": 0.3789764868603043, - "grad_norm": 1.3672326803207397, - "learning_rate": 1.8377957018990043e-05, - "loss": 1.4507, - "step": 274 - }, - { - "epoch": 0.38035961272475793, - "grad_norm": 1.5172324180603027, - "learning_rate": 1.83659922797606e-05, - "loss": 1.5094, - "step": 275 - }, - { - "epoch": 0.3817427385892116, - "grad_norm": 1.3924052715301514, - "learning_rate": 1.8353987499232747e-05, - "loss": 1.4878, - "step": 276 - }, - { - "epoch": 0.3831258644536653, - "grad_norm": 1.601944923400879, - "learning_rate": 1.834194273486374e-05, - "loss": 1.2526, - "step": 277 - }, - { - "epoch": 0.38450899031811897, - "grad_norm": 1.3992019891738892, - "learning_rate": 1.8329858044302212e-05, - "loss": 1.4725, - "step": 278 - }, - { - "epoch": 0.38589211618257263, - "grad_norm": 1.2960071563720703, - "learning_rate": 1.8317733485387893e-05, - "loss": 1.4154, - "step": 279 - }, - { - "epoch": 0.3872752420470263, - "grad_norm": 1.7117806673049927, - "learning_rate": 1.830556911615132e-05, - "loss": 1.4874, - "step": 280 - }, - { - "epoch": 0.38865836791147995, - "grad_norm": 1.374509572982788, - "learning_rate": 1.8293364994813584e-05, - "loss": 1.518, - "step": 281 - }, - { - "epoch": 0.3900414937759336, - "grad_norm": 1.2234368324279785, - "learning_rate": 1.8281121179786024e-05, - "loss": 1.1919, - "step": 282 - }, - { - "epoch": 0.3914246196403873, - "grad_norm": 1.4747345447540283, - "learning_rate": 1.826883772966997e-05, - "loss": 1.5335, - "step": 283 - }, - { - "epoch": 0.39280774550484093, - "grad_norm": 1.4171055555343628, - "learning_rate": 1.825651470325645e-05, - "loss": 1.4572, - "step": 284 - }, - { - "epoch": 0.3941908713692946, - "grad_norm": 1.4510976076126099, - "learning_rate": 1.8244152159525916e-05, - "loss": 1.4552, - "step": 285 - }, - { - "epoch": 0.39557399723374825, - "grad_norm": 1.3574010133743286, - "learning_rate": 1.823175015764795e-05, - "loss": 1.4206, - "step": 286 - }, - { - "epoch": 0.3969571230982019, - "grad_norm": 1.4223361015319824, - "learning_rate": 1.821930875698099e-05, - "loss": 1.5357, - "step": 287 - }, - { - "epoch": 0.3983402489626556, - "grad_norm": 1.4017705917358398, - "learning_rate": 1.8206828017072057e-05, - "loss": 1.4742, - "step": 288 - }, - { - "epoch": 0.3997233748271093, - "grad_norm": 1.5125157833099365, - "learning_rate": 1.819430799765644e-05, - "loss": 1.4549, - "step": 289 - }, - { - "epoch": 0.40110650069156295, - "grad_norm": 1.6485176086425781, - "learning_rate": 1.818174875865744e-05, - "loss": 1.4493, - "step": 290 - }, - { - "epoch": 0.4024896265560166, - "grad_norm": 1.5857341289520264, - "learning_rate": 1.8169150360186062e-05, - "loss": 1.5252, - "step": 291 - }, - { - "epoch": 0.40387275242047027, - "grad_norm": 1.4653831720352173, - "learning_rate": 1.815651286254074e-05, - "loss": 1.5078, - "step": 292 - }, - { - "epoch": 0.40525587828492393, - "grad_norm": 1.3680520057678223, - "learning_rate": 1.8143836326207048e-05, - "loss": 1.5462, - "step": 293 - }, - { - "epoch": 0.4066390041493776, - "grad_norm": 1.434511661529541, - "learning_rate": 1.8131120811857398e-05, - "loss": 1.5447, - "step": 294 - }, - { - "epoch": 0.40802213001383125, - "grad_norm": 1.527446985244751, - "learning_rate": 1.8118366380350773e-05, - "loss": 1.4645, - "step": 295 - }, - { - "epoch": 0.4094052558782849, - "grad_norm": 1.2728767395019531, - "learning_rate": 1.81055730927324e-05, - "loss": 1.1936, - "step": 296 - }, - { - "epoch": 0.4107883817427386, - "grad_norm": 1.5309098958969116, - "learning_rate": 1.8092741010233496e-05, - "loss": 1.3846, - "step": 297 - }, - { - "epoch": 0.41217150760719223, - "grad_norm": 1.6225330829620361, - "learning_rate": 1.8079870194270958e-05, - "loss": 1.455, - "step": 298 - }, - { - "epoch": 0.4135546334716459, - "grad_norm": 1.4223120212554932, - "learning_rate": 1.806696070644706e-05, - "loss": 1.4813, - "step": 299 - }, - { - "epoch": 0.4149377593360996, - "grad_norm": 1.55156672000885, - "learning_rate": 1.8054012608549167e-05, - "loss": 1.5128, - "step": 300 - }, - { - "epoch": 0.41632088520055327, - "grad_norm": 1.8663321733474731, - "learning_rate": 1.804102596254945e-05, - "loss": 1.3645, - "step": 301 - }, - { - "epoch": 0.41770401106500693, - "grad_norm": 1.597420334815979, - "learning_rate": 1.802800083060457e-05, - "loss": 1.5162, - "step": 302 - }, - { - "epoch": 0.4190871369294606, - "grad_norm": 1.5862244367599487, - "learning_rate": 1.8014937275055393e-05, - "loss": 1.4336, - "step": 303 - }, - { - "epoch": 0.42047026279391425, - "grad_norm": 1.530361533164978, - "learning_rate": 1.8001835358426688e-05, - "loss": 1.5836, - "step": 304 - }, - { - "epoch": 0.4218533886583679, - "grad_norm": 1.4132399559020996, - "learning_rate": 1.798869514342682e-05, - "loss": 1.5451, - "step": 305 - }, - { - "epoch": 0.42323651452282157, - "grad_norm": 1.5127052068710327, - "learning_rate": 1.7975516692947478e-05, - "loss": 1.3983, - "step": 306 - }, - { - "epoch": 0.42461964038727523, - "grad_norm": 1.491930365562439, - "learning_rate": 1.7962300070063325e-05, - "loss": 1.5258, - "step": 307 - }, - { - "epoch": 0.4260027662517289, - "grad_norm": 1.5085482597351074, - "learning_rate": 1.7949045338031744e-05, - "loss": 1.417, - "step": 308 - }, - { - "epoch": 0.42738589211618255, - "grad_norm": 1.5667784214019775, - "learning_rate": 1.793575256029252e-05, - "loss": 1.3179, - "step": 309 - }, - { - "epoch": 0.4287690179806362, - "grad_norm": 1.6200470924377441, - "learning_rate": 1.7922421800467515e-05, - "loss": 1.4283, - "step": 310 - }, - { - "epoch": 0.43015214384508993, - "grad_norm": 1.7292104959487915, - "learning_rate": 1.790905312236039e-05, - "loss": 1.4993, - "step": 311 - }, - { - "epoch": 0.4315352697095436, - "grad_norm": 1.44505774974823, - "learning_rate": 1.7895646589956294e-05, - "loss": 1.5516, - "step": 312 - }, - { - "epoch": 0.43291839557399725, - "grad_norm": 1.5757036209106445, - "learning_rate": 1.7882202267421544e-05, - "loss": 1.4155, - "step": 313 - }, - { - "epoch": 0.4343015214384509, - "grad_norm": 1.7449678182601929, - "learning_rate": 1.7868720219103343e-05, - "loss": 1.4392, - "step": 314 - }, - { - "epoch": 0.43568464730290457, - "grad_norm": 1.5077069997787476, - "learning_rate": 1.7855200509529442e-05, - "loss": 1.4805, - "step": 315 - }, - { - "epoch": 0.43706777316735823, - "grad_norm": 1.488542079925537, - "learning_rate": 1.7841643203407854e-05, - "loss": 1.4102, - "step": 316 - }, - { - "epoch": 0.4384508990318119, - "grad_norm": 1.5566229820251465, - "learning_rate": 1.7828048365626536e-05, - "loss": 1.4242, - "step": 317 - }, - { - "epoch": 0.43983402489626555, - "grad_norm": 1.5665713548660278, - "learning_rate": 1.7814416061253076e-05, - "loss": 1.4923, - "step": 318 - }, - { - "epoch": 0.4412171507607192, - "grad_norm": 1.4140158891677856, - "learning_rate": 1.780074635553439e-05, - "loss": 1.5374, - "step": 319 - }, - { - "epoch": 0.4426002766251729, - "grad_norm": 1.3860362768173218, - "learning_rate": 1.77870393138964e-05, - "loss": 1.564, - "step": 320 - }, - { - "epoch": 0.44398340248962653, - "grad_norm": 1.4771426916122437, - "learning_rate": 1.7773295001943725e-05, - "loss": 1.4382, - "step": 321 - }, - { - "epoch": 0.44536652835408025, - "grad_norm": 1.4840291738510132, - "learning_rate": 1.7759513485459367e-05, - "loss": 1.4851, - "step": 322 - }, - { - "epoch": 0.4467496542185339, - "grad_norm": 1.6433287858963013, - "learning_rate": 1.77456948304044e-05, - "loss": 1.4411, - "step": 323 - }, - { - "epoch": 0.44813278008298757, - "grad_norm": 1.5386046171188354, - "learning_rate": 1.7731839102917646e-05, - "loss": 1.5255, - "step": 324 - }, - { - "epoch": 0.44951590594744123, - "grad_norm": 1.466110110282898, - "learning_rate": 1.7717946369315365e-05, - "loss": 1.4595, - "step": 325 - }, - { - "epoch": 0.4508990318118949, - "grad_norm": 1.6857801675796509, - "learning_rate": 1.7704016696090936e-05, - "loss": 1.5345, - "step": 326 - }, - { - "epoch": 0.45228215767634855, - "grad_norm": 1.8097199201583862, - "learning_rate": 1.769005014991454e-05, - "loss": 1.3434, - "step": 327 - }, - { - "epoch": 0.4536652835408022, - "grad_norm": 1.843514084815979, - "learning_rate": 1.7676046797632834e-05, - "loss": 1.3947, - "step": 328 - }, - { - "epoch": 0.45504840940525587, - "grad_norm": 1.476454496383667, - "learning_rate": 1.7662006706268642e-05, - "loss": 1.5038, - "step": 329 - }, - { - "epoch": 0.45643153526970953, - "grad_norm": 1.6218907833099365, - "learning_rate": 1.7647929943020625e-05, - "loss": 1.4362, - "step": 330 - }, - { - "epoch": 0.4578146611341632, - "grad_norm": 1.481621503829956, - "learning_rate": 1.7633816575262966e-05, - "loss": 1.4619, - "step": 331 - }, - { - "epoch": 0.45919778699861685, - "grad_norm": 1.5661985874176025, - "learning_rate": 1.7619666670545034e-05, - "loss": 1.4317, - "step": 332 - }, - { - "epoch": 0.4605809128630705, - "grad_norm": 1.580833911895752, - "learning_rate": 1.7605480296591092e-05, - "loss": 1.3814, - "step": 333 - }, - { - "epoch": 0.46196403872752423, - "grad_norm": 1.473464012145996, - "learning_rate": 1.759125752129993e-05, - "loss": 1.4584, - "step": 334 - }, - { - "epoch": 0.4633471645919779, - "grad_norm": 1.6730810403823853, - "learning_rate": 1.757699841274458e-05, - "loss": 1.3358, - "step": 335 - }, - { - "epoch": 0.46473029045643155, - "grad_norm": 1.4953693151474, - "learning_rate": 1.7562703039171955e-05, - "loss": 1.5387, - "step": 336 - }, - { - "epoch": 0.4661134163208852, - "grad_norm": 1.5851799249649048, - "learning_rate": 1.7548371469002553e-05, - "loss": 1.5365, - "step": 337 - }, - { - "epoch": 0.46749654218533887, - "grad_norm": 1.5487456321716309, - "learning_rate": 1.753400377083011e-05, - "loss": 1.4721, - "step": 338 - }, - { - "epoch": 0.46887966804979253, - "grad_norm": 1.6268632411956787, - "learning_rate": 1.7519600013421282e-05, - "loss": 1.3934, - "step": 339 - }, - { - "epoch": 0.4702627939142462, - "grad_norm": 1.68012535572052, - "learning_rate": 1.7505160265715303e-05, - "loss": 1.5813, - "step": 340 - }, - { - "epoch": 0.47164591977869985, - "grad_norm": 1.6492486000061035, - "learning_rate": 1.7490684596823678e-05, - "loss": 1.4777, - "step": 341 - }, - { - "epoch": 0.4730290456431535, - "grad_norm": 1.950250267982483, - "learning_rate": 1.747617307602982e-05, - "loss": 1.5284, - "step": 342 - }, - { - "epoch": 0.47441217150760717, - "grad_norm": 1.640457272529602, - "learning_rate": 1.7461625772788755e-05, - "loss": 1.4236, - "step": 343 - }, - { - "epoch": 0.47579529737206083, - "grad_norm": 1.4680976867675781, - "learning_rate": 1.7447042756726756e-05, - "loss": 1.5607, - "step": 344 - }, - { - "epoch": 0.47717842323651455, - "grad_norm": 1.6982401609420776, - "learning_rate": 1.743242409764103e-05, - "loss": 1.511, - "step": 345 - }, - { - "epoch": 0.4785615491009682, - "grad_norm": 1.6441048383712769, - "learning_rate": 1.741776986549938e-05, - "loss": 1.4762, - "step": 346 - }, - { - "epoch": 0.47994467496542187, - "grad_norm": 1.6681658029556274, - "learning_rate": 1.7403080130439874e-05, - "loss": 1.5238, - "step": 347 - }, - { - "epoch": 0.48132780082987553, - "grad_norm": 1.6259121894836426, - "learning_rate": 1.7388354962770488e-05, - "loss": 1.459, - "step": 348 - }, - { - "epoch": 0.4827109266943292, - "grad_norm": 1.6403274536132812, - "learning_rate": 1.7373594432968798e-05, - "loss": 1.4974, - "step": 349 - }, - { - "epoch": 0.48409405255878285, - "grad_norm": 1.585744857788086, - "learning_rate": 1.735879861168163e-05, - "loss": 1.5599, - "step": 350 - }, - { - "epoch": 0.4854771784232365, - "grad_norm": 1.6174416542053223, - "learning_rate": 1.7343967569724716e-05, - "loss": 1.475, - "step": 351 - }, - { - "epoch": 0.48686030428769017, - "grad_norm": 1.6598519086837769, - "learning_rate": 1.7329101378082374e-05, - "loss": 1.1199, - "step": 352 - }, - { - "epoch": 0.48824343015214383, - "grad_norm": 1.6443450450897217, - "learning_rate": 1.731420010790713e-05, - "loss": 1.527, - "step": 353 - }, - { - "epoch": 0.4896265560165975, - "grad_norm": 1.7666903734207153, - "learning_rate": 1.729926383051943e-05, - "loss": 1.4852, - "step": 354 - }, - { - "epoch": 0.49100968188105115, - "grad_norm": 1.7390762567520142, - "learning_rate": 1.7284292617407253e-05, - "loss": 1.369, - "step": 355 - }, - { - "epoch": 0.49239280774550487, - "grad_norm": 1.7170841693878174, - "learning_rate": 1.7269286540225805e-05, - "loss": 1.4483, - "step": 356 - }, - { - "epoch": 0.49377593360995853, - "grad_norm": 1.6285370588302612, - "learning_rate": 1.725424567079714e-05, - "loss": 1.4945, - "step": 357 - }, - { - "epoch": 0.4951590594744122, - "grad_norm": 1.6297553777694702, - "learning_rate": 1.723917008110984e-05, - "loss": 1.4291, - "step": 358 - }, - { - "epoch": 0.49654218533886585, - "grad_norm": 1.5169857740402222, - "learning_rate": 1.722405984331867e-05, - "loss": 1.4683, - "step": 359 - }, - { - "epoch": 0.4979253112033195, - "grad_norm": 1.729242205619812, - "learning_rate": 1.720891502974423e-05, - "loss": 1.3923, - "step": 360 - }, - { - "epoch": 0.49930843706777317, - "grad_norm": 1.7176477909088135, - "learning_rate": 1.7193735712872598e-05, - "loss": 1.4261, - "step": 361 - }, - { - "epoch": 0.5006915629322268, - "grad_norm": 1.7728772163391113, - "learning_rate": 1.7178521965354992e-05, - "loss": 1.4334, - "step": 362 - }, - { - "epoch": 0.5006915629322268, - "eval_loss": 1.147789478302002, - "eval_runtime": 108.7922, - "eval_samples_per_second": 13.273, - "eval_steps_per_second": 0.836, - "step": 362 - }, - { - "epoch": 0.5020746887966805, - "grad_norm": 1.6068779230117798, - "learning_rate": 1.7163273860007434e-05, - "loss": 1.4989, - "step": 363 - }, - { - "epoch": 0.5034578146611342, - "grad_norm": 1.751523494720459, - "learning_rate": 1.714799146981037e-05, - "loss": 1.4278, - "step": 364 - }, - { - "epoch": 0.5048409405255878, - "grad_norm": 1.6334980726242065, - "learning_rate": 1.7132674867908354e-05, - "loss": 1.4772, - "step": 365 - }, - { - "epoch": 0.5062240663900415, - "grad_norm": 1.5768158435821533, - "learning_rate": 1.7117324127609686e-05, - "loss": 1.4354, - "step": 366 - }, - { - "epoch": 0.5076071922544951, - "grad_norm": 1.7999041080474854, - "learning_rate": 1.710193932238605e-05, - "loss": 1.3682, - "step": 367 - }, - { - "epoch": 0.5089903181189488, - "grad_norm": 1.8190585374832153, - "learning_rate": 1.7086520525872173e-05, - "loss": 1.4741, - "step": 368 - }, - { - "epoch": 0.5103734439834025, - "grad_norm": 1.825280785560608, - "learning_rate": 1.7071067811865477e-05, - "loss": 1.3532, - "step": 369 - }, - { - "epoch": 0.5117565698478561, - "grad_norm": 1.862087607383728, - "learning_rate": 1.7055581254325716e-05, - "loss": 1.4157, - "step": 370 - }, - { - "epoch": 0.5131396957123098, - "grad_norm": 1.8708950281143188, - "learning_rate": 1.7040060927374626e-05, - "loss": 1.4732, - "step": 371 - }, - { - "epoch": 0.5145228215767634, - "grad_norm": 1.5391322374343872, - "learning_rate": 1.7024506905295566e-05, - "loss": 1.4484, - "step": 372 - }, - { - "epoch": 0.5159059474412172, - "grad_norm": 1.7950142621994019, - "learning_rate": 1.7008919262533174e-05, - "loss": 1.4488, - "step": 373 - }, - { - "epoch": 0.5172890733056709, - "grad_norm": 1.7594027519226074, - "learning_rate": 1.6993298073693005e-05, - "loss": 1.4757, - "step": 374 - }, - { - "epoch": 0.5186721991701245, - "grad_norm": 1.7801642417907715, - "learning_rate": 1.6977643413541156e-05, - "loss": 1.4611, - "step": 375 - }, - { - "epoch": 0.5200553250345782, - "grad_norm": 1.7385730743408203, - "learning_rate": 1.6961955357003948e-05, - "loss": 1.5696, - "step": 376 - }, - { - "epoch": 0.5214384508990318, - "grad_norm": 2.0058438777923584, - "learning_rate": 1.6946233979167516e-05, - "loss": 1.3498, - "step": 377 - }, - { - "epoch": 0.5228215767634855, - "grad_norm": 1.7878988981246948, - "learning_rate": 1.693047935527751e-05, - "loss": 1.4722, - "step": 378 - }, - { - "epoch": 0.5242047026279392, - "grad_norm": 1.900752067565918, - "learning_rate": 1.6914691560738675e-05, - "loss": 1.418, - "step": 379 - }, - { - "epoch": 0.5255878284923928, - "grad_norm": 1.7536234855651855, - "learning_rate": 1.6898870671114527e-05, - "loss": 1.4366, - "step": 380 - }, - { - "epoch": 0.5269709543568465, - "grad_norm": 1.7499502897262573, - "learning_rate": 1.6883016762126986e-05, - "loss": 1.3753, - "step": 381 - }, - { - "epoch": 0.5283540802213001, - "grad_norm": 1.7654677629470825, - "learning_rate": 1.6867129909656e-05, - "loss": 1.3705, - "step": 382 - }, - { - "epoch": 0.5297372060857538, - "grad_norm": 1.679612159729004, - "learning_rate": 1.6851210189739195e-05, - "loss": 1.5566, - "step": 383 - }, - { - "epoch": 0.5311203319502075, - "grad_norm": 1.8122912645339966, - "learning_rate": 1.6835257678571515e-05, - "loss": 1.4993, - "step": 384 - }, - { - "epoch": 0.5325034578146611, - "grad_norm": 1.8270299434661865, - "learning_rate": 1.681927245250484e-05, - "loss": 1.4321, - "step": 385 - }, - { - "epoch": 0.5338865836791148, - "grad_norm": 1.7611494064331055, - "learning_rate": 1.680325458804763e-05, - "loss": 1.4337, - "step": 386 - }, - { - "epoch": 0.5352697095435685, - "grad_norm": 1.650876760482788, - "learning_rate": 1.6787204161864562e-05, - "loss": 1.4233, - "step": 387 - }, - { - "epoch": 0.5366528354080221, - "grad_norm": 1.6672831773757935, - "learning_rate": 1.6771121250776163e-05, - "loss": 1.3721, - "step": 388 - }, - { - "epoch": 0.5380359612724758, - "grad_norm": 1.7278107404708862, - "learning_rate": 1.675500593175843e-05, - "loss": 1.51, - "step": 389 - }, - { - "epoch": 0.5394190871369294, - "grad_norm": 1.7551602125167847, - "learning_rate": 1.6738858281942477e-05, - "loss": 1.5643, - "step": 390 - }, - { - "epoch": 0.5408022130013831, - "grad_norm": 1.679298996925354, - "learning_rate": 1.6722678378614164e-05, - "loss": 1.466, - "step": 391 - }, - { - "epoch": 0.5421853388658368, - "grad_norm": 1.9446730613708496, - "learning_rate": 1.6706466299213718e-05, - "loss": 1.4915, - "step": 392 - }, - { - "epoch": 0.5435684647302904, - "grad_norm": 1.822323203086853, - "learning_rate": 1.6690222121335357e-05, - "loss": 1.4275, - "step": 393 - }, - { - "epoch": 0.5449515905947441, - "grad_norm": 2.0493321418762207, - "learning_rate": 1.6673945922726945e-05, - "loss": 1.3294, - "step": 394 - }, - { - "epoch": 0.5463347164591977, - "grad_norm": 1.8270610570907593, - "learning_rate": 1.6657637781289596e-05, - "loss": 1.4471, - "step": 395 - }, - { - "epoch": 0.5477178423236515, - "grad_norm": 1.9434516429901123, - "learning_rate": 1.6641297775077313e-05, - "loss": 1.4606, - "step": 396 - }, - { - "epoch": 0.5491009681881052, - "grad_norm": 1.714198350906372, - "learning_rate": 1.66249259822966e-05, - "loss": 1.4398, - "step": 397 - }, - { - "epoch": 0.5504840940525588, - "grad_norm": 1.787815809249878, - "learning_rate": 1.660852248130611e-05, - "loss": 1.3972, - "step": 398 - }, - { - "epoch": 0.5518672199170125, - "grad_norm": 1.9069771766662598, - "learning_rate": 1.6592087350616245e-05, - "loss": 1.4049, - "step": 399 - }, - { - "epoch": 0.5532503457814661, - "grad_norm": 1.8910343647003174, - "learning_rate": 1.6575620668888812e-05, - "loss": 1.3682, - "step": 400 - }, - { - "epoch": 0.5546334716459198, - "grad_norm": 1.7473318576812744, - "learning_rate": 1.6559122514936606e-05, - "loss": 1.3818, - "step": 401 - }, - { - "epoch": 0.5560165975103735, - "grad_norm": 1.792608380317688, - "learning_rate": 1.6542592967723065e-05, - "loss": 1.343, - "step": 402 - }, - { - "epoch": 0.5573997233748271, - "grad_norm": 1.8886102437973022, - "learning_rate": 1.6526032106361888e-05, - "loss": 1.3608, - "step": 403 - }, - { - "epoch": 0.5587828492392808, - "grad_norm": 1.8249069452285767, - "learning_rate": 1.6509440010116634e-05, - "loss": 1.2995, - "step": 404 - }, - { - "epoch": 0.5601659751037344, - "grad_norm": 1.9114660024642944, - "learning_rate": 1.649281675840037e-05, - "loss": 1.4432, - "step": 405 - }, - { - "epoch": 0.5615491009681881, - "grad_norm": 1.9942965507507324, - "learning_rate": 1.6476162430775278e-05, - "loss": 1.4114, - "step": 406 - }, - { - "epoch": 0.5629322268326418, - "grad_norm": 1.7948976755142212, - "learning_rate": 1.645947710695227e-05, - "loss": 1.3939, - "step": 407 - }, - { - "epoch": 0.5643153526970954, - "grad_norm": 1.7326204776763916, - "learning_rate": 1.6442760866790616e-05, - "loss": 1.4526, - "step": 408 - }, - { - "epoch": 0.5656984785615491, - "grad_norm": 1.7523624897003174, - "learning_rate": 1.6426013790297556e-05, - "loss": 1.5081, - "step": 409 - }, - { - "epoch": 0.5670816044260027, - "grad_norm": 2.0211379528045654, - "learning_rate": 1.6409235957627926e-05, - "loss": 1.281, - "step": 410 - }, - { - "epoch": 0.5684647302904564, - "grad_norm": 2.0155961513519287, - "learning_rate": 1.639242744908375e-05, - "loss": 1.327, - "step": 411 - }, - { - "epoch": 0.5698478561549101, - "grad_norm": 1.7048110961914062, - "learning_rate": 1.6375588345113895e-05, - "loss": 1.5404, - "step": 412 - }, - { - "epoch": 0.5712309820193637, - "grad_norm": 1.8413456678390503, - "learning_rate": 1.6358718726313645e-05, - "loss": 1.5062, - "step": 413 - }, - { - "epoch": 0.5726141078838174, - "grad_norm": 1.6750757694244385, - "learning_rate": 1.6341818673424342e-05, - "loss": 0.9839, - "step": 414 - }, - { - "epoch": 0.573997233748271, - "grad_norm": 1.7165724039077759, - "learning_rate": 1.6324888267332998e-05, - "loss": 1.4027, - "step": 415 - }, - { - "epoch": 0.5753803596127247, - "grad_norm": 1.8710963726043701, - "learning_rate": 1.630792758907189e-05, - "loss": 1.3251, - "step": 416 - }, - { - "epoch": 0.5767634854771784, - "grad_norm": 1.9592095613479614, - "learning_rate": 1.6290936719818182e-05, - "loss": 1.4502, - "step": 417 - }, - { - "epoch": 0.5781466113416321, - "grad_norm": 1.7474467754364014, - "learning_rate": 1.6273915740893557e-05, - "loss": 1.3636, - "step": 418 - }, - { - "epoch": 0.5795297372060858, - "grad_norm": 1.893127202987671, - "learning_rate": 1.6256864733763787e-05, - "loss": 1.4927, - "step": 419 - }, - { - "epoch": 0.5809128630705395, - "grad_norm": 1.8203260898590088, - "learning_rate": 1.6239783780038374e-05, - "loss": 1.493, - "step": 420 - }, - { - "epoch": 0.5822959889349931, - "grad_norm": 2.0269949436187744, - "learning_rate": 1.6222672961470158e-05, - "loss": 1.3748, - "step": 421 - }, - { - "epoch": 0.5836791147994468, - "grad_norm": 1.7836514711380005, - "learning_rate": 1.6205532359954905e-05, - "loss": 1.3786, - "step": 422 - }, - { - "epoch": 0.5850622406639004, - "grad_norm": 1.7503560781478882, - "learning_rate": 1.618836205753093e-05, - "loss": 1.3905, - "step": 423 - }, - { - "epoch": 0.5864453665283541, - "grad_norm": 1.9079328775405884, - "learning_rate": 1.6171162136378716e-05, - "loss": 1.51, - "step": 424 - }, - { - "epoch": 0.5878284923928078, - "grad_norm": 2.0547492504119873, - "learning_rate": 1.6153932678820487e-05, - "loss": 1.5062, - "step": 425 - }, - { - "epoch": 0.5892116182572614, - "grad_norm": 1.9388115406036377, - "learning_rate": 1.6136673767319853e-05, - "loss": 1.5496, - "step": 426 - }, - { - "epoch": 0.5905947441217151, - "grad_norm": 1.8805044889450073, - "learning_rate": 1.611938548448138e-05, - "loss": 1.4072, - "step": 427 - }, - { - "epoch": 0.5919778699861687, - "grad_norm": 1.9267683029174805, - "learning_rate": 1.6102067913050227e-05, - "loss": 1.4047, - "step": 428 - }, - { - "epoch": 0.5933609958506224, - "grad_norm": 1.8546456098556519, - "learning_rate": 1.6084721135911715e-05, - "loss": 1.4631, - "step": 429 - }, - { - "epoch": 0.5947441217150761, - "grad_norm": 1.7463114261627197, - "learning_rate": 1.606734523609097e-05, - "loss": 1.4703, - "step": 430 - }, - { - "epoch": 0.5961272475795297, - "grad_norm": 2.014925003051758, - "learning_rate": 1.604994029675249e-05, - "loss": 1.294, - "step": 431 - }, - { - "epoch": 0.5975103734439834, - "grad_norm": 2.1670687198638916, - "learning_rate": 1.603250640119977e-05, - "loss": 1.2759, - "step": 432 - }, - { - "epoch": 0.598893499308437, - "grad_norm": 1.9688105583190918, - "learning_rate": 1.601504363287489e-05, - "loss": 1.4053, - "step": 433 - }, - { - "epoch": 0.6002766251728907, - "grad_norm": 2.0988128185272217, - "learning_rate": 1.5997552075358122e-05, - "loss": 1.3473, - "step": 434 - }, - { - "epoch": 0.6016597510373444, - "grad_norm": 2.176347255706787, - "learning_rate": 1.598003181236753e-05, - "loss": 1.3099, - "step": 435 - }, - { - "epoch": 0.603042876901798, - "grad_norm": 2.1589407920837402, - "learning_rate": 1.5962482927758568e-05, - "loss": 1.3499, - "step": 436 - }, - { - "epoch": 0.6044260027662517, - "grad_norm": 1.9357820749282837, - "learning_rate": 1.5944905505523677e-05, - "loss": 1.3215, - "step": 437 - }, - { - "epoch": 0.6058091286307054, - "grad_norm": 1.7777889966964722, - "learning_rate": 1.592729962979189e-05, - "loss": 1.4082, - "step": 438 - }, - { - "epoch": 0.607192254495159, - "grad_norm": 2.0036303997039795, - "learning_rate": 1.590966538482842e-05, - "loss": 1.4111, - "step": 439 - }, - { - "epoch": 0.6085753803596127, - "grad_norm": 1.9790940284729004, - "learning_rate": 1.589200285503426e-05, - "loss": 1.3613, - "step": 440 - }, - { - "epoch": 0.6099585062240664, - "grad_norm": 1.9340951442718506, - "learning_rate": 1.5874312124945773e-05, - "loss": 1.3719, - "step": 441 - }, - { - "epoch": 0.6113416320885201, - "grad_norm": 2.064161777496338, - "learning_rate": 1.585659327923432e-05, - "loss": 1.414, - "step": 442 - }, - { - "epoch": 0.6127247579529738, - "grad_norm": 1.955349326133728, - "learning_rate": 1.5838846402705793e-05, - "loss": 1.4853, - "step": 443 - }, - { - "epoch": 0.6141078838174274, - "grad_norm": 2.2459092140197754, - "learning_rate": 1.582107158030027e-05, - "loss": 1.2444, - "step": 444 - }, - { - "epoch": 0.6154910096818811, - "grad_norm": 1.9782187938690186, - "learning_rate": 1.5803268897091582e-05, - "loss": 1.3512, - "step": 445 - }, - { - "epoch": 0.6168741355463347, - "grad_norm": 1.6818398237228394, - "learning_rate": 1.5785438438286892e-05, - "loss": 1.0878, - "step": 446 - }, - { - "epoch": 0.6182572614107884, - "grad_norm": 2.271744728088379, - "learning_rate": 1.576758028922632e-05, - "loss": 1.5163, - "step": 447 - }, - { - "epoch": 0.6196403872752421, - "grad_norm": 2.1444921493530273, - "learning_rate": 1.574969453538251e-05, - "loss": 1.4333, - "step": 448 - }, - { - "epoch": 0.6210235131396957, - "grad_norm": 2.0727901458740234, - "learning_rate": 1.573178126236022e-05, - "loss": 1.5082, - "step": 449 - }, - { - "epoch": 0.6224066390041494, - "grad_norm": 1.9815788269042969, - "learning_rate": 1.5713840555895937e-05, - "loss": 1.3223, - "step": 450 - }, - { - "epoch": 0.623789764868603, - "grad_norm": 2.123006820678711, - "learning_rate": 1.569587250185743e-05, - "loss": 1.3614, - "step": 451 - }, - { - "epoch": 0.6251728907330567, - "grad_norm": 2.089308738708496, - "learning_rate": 1.567787718624338e-05, - "loss": 1.3747, - "step": 452 - }, - { - "epoch": 0.6265560165975104, - "grad_norm": 2.087164878845215, - "learning_rate": 1.5659854695182928e-05, - "loss": 1.4676, - "step": 453 - }, - { - "epoch": 0.627939142461964, - "grad_norm": 1.9156312942504883, - "learning_rate": 1.5641805114935297e-05, - "loss": 1.4936, - "step": 454 - }, - { - "epoch": 0.6293222683264177, - "grad_norm": 2.051100492477417, - "learning_rate": 1.5623728531889356e-05, - "loss": 1.3817, - "step": 455 - }, - { - "epoch": 0.6307053941908713, - "grad_norm": 1.8827084302902222, - "learning_rate": 1.560562503256322e-05, - "loss": 1.4932, - "step": 456 - }, - { - "epoch": 0.632088520055325, - "grad_norm": 1.9822756052017212, - "learning_rate": 1.558749470360382e-05, - "loss": 1.3448, - "step": 457 - }, - { - "epoch": 0.6334716459197787, - "grad_norm": 2.2374722957611084, - "learning_rate": 1.556933763178651e-05, - "loss": 1.4071, - "step": 458 - }, - { - "epoch": 0.6348547717842323, - "grad_norm": 2.2126710414886475, - "learning_rate": 1.5551153904014645e-05, - "loss": 1.3177, - "step": 459 - }, - { - "epoch": 0.636237897648686, - "grad_norm": 1.9302269220352173, - "learning_rate": 1.5532943607319143e-05, - "loss": 1.3762, - "step": 460 - }, - { - "epoch": 0.6376210235131397, - "grad_norm": 2.0682942867279053, - "learning_rate": 1.5514706828858096e-05, - "loss": 1.4728, - "step": 461 - }, - { - "epoch": 0.6390041493775933, - "grad_norm": 1.912061333656311, - "learning_rate": 1.5496443655916348e-05, - "loss": 1.3751, - "step": 462 - }, - { - "epoch": 0.640387275242047, - "grad_norm": 2.072456121444702, - "learning_rate": 1.547815417590506e-05, - "loss": 1.3737, - "step": 463 - }, - { - "epoch": 0.6417704011065007, - "grad_norm": 2.1260902881622314, - "learning_rate": 1.5459838476361326e-05, - "loss": 1.4867, - "step": 464 - }, - { - "epoch": 0.6431535269709544, - "grad_norm": 2.0210676193237305, - "learning_rate": 1.54414966449477e-05, - "loss": 1.4505, - "step": 465 - }, - { - "epoch": 0.6445366528354081, - "grad_norm": 1.9175024032592773, - "learning_rate": 1.5423128769451832e-05, - "loss": 1.4385, - "step": 466 - }, - { - "epoch": 0.6459197786998617, - "grad_norm": 1.7762582302093506, - "learning_rate": 1.5404734937786017e-05, - "loss": 1.0755, - "step": 467 - }, - { - "epoch": 0.6473029045643154, - "grad_norm": 2.193601131439209, - "learning_rate": 1.5386315237986785e-05, - "loss": 1.4005, - "step": 468 - }, - { - "epoch": 0.648686030428769, - "grad_norm": 2.159917116165161, - "learning_rate": 1.5367869758214466e-05, - "loss": 1.4358, - "step": 469 - }, - { - "epoch": 0.6500691562932227, - "grad_norm": 2.10957932472229, - "learning_rate": 1.5349398586752794e-05, - "loss": 1.3415, - "step": 470 - }, - { - "epoch": 0.6514522821576764, - "grad_norm": 2.0722029209136963, - "learning_rate": 1.533090181200845e-05, - "loss": 1.5206, - "step": 471 - }, - { - "epoch": 0.65283540802213, - "grad_norm": 2.0709426403045654, - "learning_rate": 1.5312379522510666e-05, - "loss": 1.5009, - "step": 472 - }, - { - "epoch": 0.6542185338865837, - "grad_norm": 2.1032183170318604, - "learning_rate": 1.5293831806910803e-05, - "loss": 1.4927, - "step": 473 - }, - { - "epoch": 0.6556016597510373, - "grad_norm": 2.0395522117614746, - "learning_rate": 1.52752587539819e-05, - "loss": 1.4599, - "step": 474 - }, - { - "epoch": 0.656984785615491, - "grad_norm": 1.9160783290863037, - "learning_rate": 1.5256660452618276e-05, - "loss": 1.389, - "step": 475 - }, - { - "epoch": 0.6583679114799447, - "grad_norm": 2.017306089401245, - "learning_rate": 1.5238036991835085e-05, - "loss": 1.3694, - "step": 476 - }, - { - "epoch": 0.6597510373443983, - "grad_norm": 2.1166417598724365, - "learning_rate": 1.521938846076791e-05, - "loss": 1.2891, - "step": 477 - }, - { - "epoch": 0.661134163208852, - "grad_norm": 2.1959710121154785, - "learning_rate": 1.5200714948672313e-05, - "loss": 1.4218, - "step": 478 - }, - { - "epoch": 0.6625172890733056, - "grad_norm": 2.0393929481506348, - "learning_rate": 1.5182016544923432e-05, - "loss": 1.3561, - "step": 479 - }, - { - "epoch": 0.6639004149377593, - "grad_norm": 2.0378098487854004, - "learning_rate": 1.5163293339015535e-05, - "loss": 1.4816, - "step": 480 - }, - { - "epoch": 0.665283540802213, - "grad_norm": 2.1572370529174805, - "learning_rate": 1.5144545420561598e-05, - "loss": 1.4697, - "step": 481 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 1.9824397563934326, - "learning_rate": 1.512577287929288e-05, - "loss": 1.4023, - "step": 482 - }, - { - "epoch": 0.6680497925311203, - "grad_norm": 2.1600515842437744, - "learning_rate": 1.5106975805058483e-05, - "loss": 1.3469, - "step": 483 - }, - { - "epoch": 0.669432918395574, - "grad_norm": 2.0499649047851562, - "learning_rate": 1.5088154287824934e-05, - "loss": 1.4444, - "step": 484 - }, - { - "epoch": 0.6708160442600276, - "grad_norm": 2.1112446784973145, - "learning_rate": 1.506930841767575e-05, - "loss": 1.3261, - "step": 485 - }, - { - "epoch": 0.6721991701244814, - "grad_norm": 2.0517220497131348, - "learning_rate": 1.5050438284811001e-05, - "loss": 1.398, - "step": 486 - }, - { - "epoch": 0.673582295988935, - "grad_norm": 2.2738828659057617, - "learning_rate": 1.5031543979546887e-05, - "loss": 1.3997, - "step": 487 - }, - { - "epoch": 0.6749654218533887, - "grad_norm": 2.024562358856201, - "learning_rate": 1.5012625592315298e-05, - "loss": 1.3629, - "step": 488 - }, - { - "epoch": 0.6763485477178424, - "grad_norm": 2.437912940979004, - "learning_rate": 1.499368321366339e-05, - "loss": 1.3036, - "step": 489 - }, - { - "epoch": 0.677731673582296, - "grad_norm": 1.9140431880950928, - "learning_rate": 1.4974716934253146e-05, - "loss": 1.446, - "step": 490 - }, - { - "epoch": 0.6791147994467497, - "grad_norm": 2.439692974090576, - "learning_rate": 1.4955726844860939e-05, - "loss": 1.453, - "step": 491 - }, - { - "epoch": 0.6804979253112033, - "grad_norm": 2.154038906097412, - "learning_rate": 1.4936713036377102e-05, - "loss": 1.3837, - "step": 492 - }, - { - "epoch": 0.681881051175657, - "grad_norm": 2.1363091468811035, - "learning_rate": 1.4917675599805497e-05, - "loss": 1.3759, - "step": 493 - }, - { - "epoch": 0.6832641770401107, - "grad_norm": 2.051652431488037, - "learning_rate": 1.4898614626263066e-05, - "loss": 1.3556, - "step": 494 - }, - { - "epoch": 0.6846473029045643, - "grad_norm": 2.1247611045837402, - "learning_rate": 1.4879530206979418e-05, - "loss": 1.3635, - "step": 495 - }, - { - "epoch": 0.686030428769018, - "grad_norm": 2.284594774246216, - "learning_rate": 1.4860422433296363e-05, - "loss": 1.4161, - "step": 496 - }, - { - "epoch": 0.6874135546334716, - "grad_norm": 2.339827060699463, - "learning_rate": 1.4841291396667494e-05, - "loss": 1.4127, - "step": 497 - }, - { - "epoch": 0.6887966804979253, - "grad_norm": 2.558943033218384, - "learning_rate": 1.4822137188657752e-05, - "loss": 1.384, - "step": 498 - }, - { - "epoch": 0.690179806362379, - "grad_norm": 2.1640422344207764, - "learning_rate": 1.4802959900942967e-05, - "loss": 1.3584, - "step": 499 - }, - { - "epoch": 0.6915629322268326, - "grad_norm": 2.1310927867889404, - "learning_rate": 1.4783759625309454e-05, - "loss": 1.375, - "step": 500 - }, - { - "epoch": 0.6929460580912863, - "grad_norm": 2.0568363666534424, - "learning_rate": 1.4764536453653536e-05, - "loss": 1.383, - "step": 501 - }, - { - "epoch": 0.69432918395574, - "grad_norm": 2.259617567062378, - "learning_rate": 1.474529047798112e-05, - "loss": 1.3432, - "step": 502 - }, - { - "epoch": 0.6957123098201936, - "grad_norm": 2.0728511810302734, - "learning_rate": 1.4726021790407268e-05, - "loss": 1.4592, - "step": 503 - }, - { - "epoch": 0.6970954356846473, - "grad_norm": 2.205367088317871, - "learning_rate": 1.4706730483155738e-05, - "loss": 1.4189, - "step": 504 - }, - { - "epoch": 0.6984785615491009, - "grad_norm": 2.147625207901001, - "learning_rate": 1.4687416648558555e-05, - "loss": 1.4086, - "step": 505 - }, - { - "epoch": 0.6998616874135546, - "grad_norm": 2.062960624694824, - "learning_rate": 1.4668080379055563e-05, - "loss": 1.4586, - "step": 506 - }, - { - "epoch": 0.7012448132780082, - "grad_norm": 2.2476038932800293, - "learning_rate": 1.4648721767193981e-05, - "loss": 1.4667, - "step": 507 - }, - { - "epoch": 0.7026279391424619, - "grad_norm": 2.2841742038726807, - "learning_rate": 1.4629340905627964e-05, - "loss": 1.3349, - "step": 508 - }, - { - "epoch": 0.7040110650069157, - "grad_norm": 2.1537253856658936, - "learning_rate": 1.4609937887118165e-05, - "loss": 1.4284, - "step": 509 - }, - { - "epoch": 0.7053941908713693, - "grad_norm": 2.1182830333709717, - "learning_rate": 1.4590512804531272e-05, - "loss": 1.4696, - "step": 510 - }, - { - "epoch": 0.706777316735823, - "grad_norm": 1.9890772104263306, - "learning_rate": 1.4571065750839586e-05, - "loss": 1.4727, - "step": 511 - }, - { - "epoch": 0.7081604426002767, - "grad_norm": 2.303697347640991, - "learning_rate": 1.4551596819120564e-05, - "loss": 1.3592, - "step": 512 - }, - { - "epoch": 0.7095435684647303, - "grad_norm": 2.0708582401275635, - "learning_rate": 1.4532106102556377e-05, - "loss": 1.3987, - "step": 513 - }, - { - "epoch": 0.710926694329184, - "grad_norm": 2.2212769985198975, - "learning_rate": 1.4512593694433455e-05, - "loss": 1.3956, - "step": 514 - }, - { - "epoch": 0.7123098201936376, - "grad_norm": 2.004349946975708, - "learning_rate": 1.4493059688142055e-05, - "loss": 1.3919, - "step": 515 - }, - { - "epoch": 0.7136929460580913, - "grad_norm": 2.043989658355713, - "learning_rate": 1.447350417717581e-05, - "loss": 1.5023, - "step": 516 - }, - { - "epoch": 0.715076071922545, - "grad_norm": 2.0797524452209473, - "learning_rate": 1.445392725513127e-05, - "loss": 1.4577, - "step": 517 - }, - { - "epoch": 0.7164591977869986, - "grad_norm": 2.4839534759521484, - "learning_rate": 1.4434329015707468e-05, - "loss": 1.4462, - "step": 518 - }, - { - "epoch": 0.7178423236514523, - "grad_norm": 2.393336057662964, - "learning_rate": 1.4414709552705465e-05, - "loss": 1.3544, - "step": 519 - }, - { - "epoch": 0.719225449515906, - "grad_norm": 2.1947097778320312, - "learning_rate": 1.4395068960027903e-05, - "loss": 1.3591, - "step": 520 - }, - { - "epoch": 0.7206085753803596, - "grad_norm": 2.231341600418091, - "learning_rate": 1.4375407331678553e-05, - "loss": 1.4389, - "step": 521 - }, - { - "epoch": 0.7219917012448133, - "grad_norm": 2.1521496772766113, - "learning_rate": 1.435572476176187e-05, - "loss": 1.3715, - "step": 522 - }, - { - "epoch": 0.7233748271092669, - "grad_norm": 2.3095715045928955, - "learning_rate": 1.4336021344482539e-05, - "loss": 1.4191, - "step": 523 - }, - { - "epoch": 0.7247579529737206, - "grad_norm": 2.364912986755371, - "learning_rate": 1.4316297174145018e-05, - "loss": 1.3801, - "step": 524 - }, - { - "epoch": 0.7261410788381742, - "grad_norm": 2.283719301223755, - "learning_rate": 1.4296552345153099e-05, - "loss": 1.3237, - "step": 525 - }, - { - "epoch": 0.7275242047026279, - "grad_norm": 2.2587459087371826, - "learning_rate": 1.427678695200945e-05, - "loss": 1.3574, - "step": 526 - }, - { - "epoch": 0.7289073305670816, - "grad_norm": 2.1912143230438232, - "learning_rate": 1.4257001089315173e-05, - "loss": 1.3736, - "step": 527 - }, - { - "epoch": 0.7302904564315352, - "grad_norm": 2.349207639694214, - "learning_rate": 1.4237194851769318e-05, - "loss": 1.4744, - "step": 528 - }, - { - "epoch": 0.7316735822959889, - "grad_norm": 2.1632063388824463, - "learning_rate": 1.4217368334168472e-05, - "loss": 1.4335, - "step": 529 - }, - { - "epoch": 0.7330567081604425, - "grad_norm": 2.3758816719055176, - "learning_rate": 1.4197521631406279e-05, - "loss": 1.3985, - "step": 530 - }, - { - "epoch": 0.7344398340248963, - "grad_norm": 2.2720682621002197, - "learning_rate": 1.4177654838472996e-05, - "loss": 1.2187, - "step": 531 - }, - { - "epoch": 0.73582295988935, - "grad_norm": 2.222930908203125, - "learning_rate": 1.4157768050455038e-05, - "loss": 1.3715, - "step": 532 - }, - { - "epoch": 0.7372060857538036, - "grad_norm": 2.493760824203491, - "learning_rate": 1.4137861362534513e-05, - "loss": 1.3032, - "step": 533 - }, - { - "epoch": 0.7385892116182573, - "grad_norm": 2.4699270725250244, - "learning_rate": 1.4117934869988776e-05, - "loss": 1.2739, - "step": 534 - }, - { - "epoch": 0.739972337482711, - "grad_norm": 2.443169593811035, - "learning_rate": 1.4097988668189977e-05, - "loss": 1.4135, - "step": 535 - }, - { - "epoch": 0.7413554633471646, - "grad_norm": 2.4985241889953613, - "learning_rate": 1.4078022852604591e-05, - "loss": 1.331, - "step": 536 - }, - { - "epoch": 0.7427385892116183, - "grad_norm": 2.4425034523010254, - "learning_rate": 1.4058037518792975e-05, - "loss": 1.4912, - "step": 537 - }, - { - "epoch": 0.7441217150760719, - "grad_norm": 2.4378395080566406, - "learning_rate": 1.4038032762408897e-05, - "loss": 1.4527, - "step": 538 - }, - { - "epoch": 0.7455048409405256, - "grad_norm": 2.427049160003662, - "learning_rate": 1.4018008679199092e-05, - "loss": 1.269, - "step": 539 - }, - { - "epoch": 0.7468879668049793, - "grad_norm": 2.3256077766418457, - "learning_rate": 1.3997965365002789e-05, - "loss": 1.3976, - "step": 540 - }, - { - "epoch": 0.7482710926694329, - "grad_norm": 2.26210880279541, - "learning_rate": 1.3977902915751268e-05, - "loss": 1.3096, - "step": 541 - }, - { - "epoch": 0.7496542185338866, - "grad_norm": 2.7226202487945557, - "learning_rate": 1.3957821427467392e-05, - "loss": 1.4281, - "step": 542 - }, - { - "epoch": 0.7510373443983402, - "grad_norm": 2.3421599864959717, - "learning_rate": 1.3937720996265147e-05, - "loss": 1.3555, - "step": 543 - }, - { - "epoch": 0.7510373443983402, - "eval_loss": 1.1585720777511597, - "eval_runtime": 108.9727, - "eval_samples_per_second": 13.251, - "eval_steps_per_second": 0.835, - "step": 543 - }, - { - "epoch": 0.7524204702627939, - "grad_norm": 2.397235870361328, - "learning_rate": 1.3917601718349183e-05, - "loss": 1.362, - "step": 544 - }, - { - "epoch": 0.7538035961272476, - "grad_norm": 2.4912915229797363, - "learning_rate": 1.3897463690014353e-05, - "loss": 1.2447, - "step": 545 - }, - { - "epoch": 0.7551867219917012, - "grad_norm": 2.4246633052825928, - "learning_rate": 1.3877307007645256e-05, - "loss": 1.4679, - "step": 546 - }, - { - "epoch": 0.7565698478561549, - "grad_norm": 2.182666778564453, - "learning_rate": 1.385713176771577e-05, - "loss": 1.1187, - "step": 547 - }, - { - "epoch": 0.7579529737206085, - "grad_norm": 2.2946391105651855, - "learning_rate": 1.3836938066788599e-05, - "loss": 1.3262, - "step": 548 - }, - { - "epoch": 0.7593360995850622, - "grad_norm": 2.28464412689209, - "learning_rate": 1.3816726001514802e-05, - "loss": 1.4728, - "step": 549 - }, - { - "epoch": 0.7607192254495159, - "grad_norm": 2.2874138355255127, - "learning_rate": 1.3796495668633325e-05, - "loss": 1.3737, - "step": 550 - }, - { - "epoch": 0.7621023513139695, - "grad_norm": 2.259789228439331, - "learning_rate": 1.377624716497056e-05, - "loss": 1.3688, - "step": 551 - }, - { - "epoch": 0.7634854771784232, - "grad_norm": 2.3231420516967773, - "learning_rate": 1.3755980587439857e-05, - "loss": 1.4047, - "step": 552 - }, - { - "epoch": 0.7648686030428768, - "grad_norm": 2.470014810562134, - "learning_rate": 1.3735696033041079e-05, - "loss": 1.4894, - "step": 553 - }, - { - "epoch": 0.7662517289073306, - "grad_norm": 2.389801263809204, - "learning_rate": 1.3715393598860129e-05, - "loss": 1.3389, - "step": 554 - }, - { - "epoch": 0.7676348547717843, - "grad_norm": 2.190786123275757, - "learning_rate": 1.369507338206848e-05, - "loss": 1.3945, - "step": 555 - }, - { - "epoch": 0.7690179806362379, - "grad_norm": 2.5276217460632324, - "learning_rate": 1.367473547992272e-05, - "loss": 1.3594, - "step": 556 - }, - { - "epoch": 0.7704011065006916, - "grad_norm": 2.6756093502044678, - "learning_rate": 1.3654379989764084e-05, - "loss": 1.3529, - "step": 557 - }, - { - "epoch": 0.7717842323651453, - "grad_norm": 2.585706949234009, - "learning_rate": 1.3634007009017986e-05, - "loss": 1.3762, - "step": 558 - }, - { - "epoch": 0.7731673582295989, - "grad_norm": 2.3161914348602295, - "learning_rate": 1.3613616635193551e-05, - "loss": 1.4073, - "step": 559 - }, - { - "epoch": 0.7745504840940526, - "grad_norm": 2.3607048988342285, - "learning_rate": 1.3593208965883156e-05, - "loss": 1.4318, - "step": 560 - }, - { - "epoch": 0.7759336099585062, - "grad_norm": 2.048332691192627, - "learning_rate": 1.357278409876195e-05, - "loss": 1.1218, - "step": 561 - }, - { - "epoch": 0.7773167358229599, - "grad_norm": 2.6801364421844482, - "learning_rate": 1.3552342131587399e-05, - "loss": 1.4298, - "step": 562 - }, - { - "epoch": 0.7786998616874136, - "grad_norm": 2.579087972640991, - "learning_rate": 1.3531883162198815e-05, - "loss": 1.3948, - "step": 563 - }, - { - "epoch": 0.7800829875518672, - "grad_norm": 2.5193817615509033, - "learning_rate": 1.351140728851688e-05, - "loss": 1.347, - "step": 564 - }, - { - "epoch": 0.7814661134163209, - "grad_norm": 2.2663426399230957, - "learning_rate": 1.3490914608543189e-05, - "loss": 1.4367, - "step": 565 - }, - { - "epoch": 0.7828492392807745, - "grad_norm": 2.56097412109375, - "learning_rate": 1.3470405220359773e-05, - "loss": 1.256, - "step": 566 - }, - { - "epoch": 0.7842323651452282, - "grad_norm": 2.6170148849487305, - "learning_rate": 1.3449879222128628e-05, - "loss": 1.3289, - "step": 567 - }, - { - "epoch": 0.7856154910096819, - "grad_norm": 2.7682836055755615, - "learning_rate": 1.3429336712091258e-05, - "loss": 1.3766, - "step": 568 - }, - { - "epoch": 0.7869986168741355, - "grad_norm": 2.343817949295044, - "learning_rate": 1.340877778856819e-05, - "loss": 1.4599, - "step": 569 - }, - { - "epoch": 0.7883817427385892, - "grad_norm": 2.5406904220581055, - "learning_rate": 1.3388202549958507e-05, - "loss": 1.46, - "step": 570 - }, - { - "epoch": 0.7897648686030428, - "grad_norm": 2.5566861629486084, - "learning_rate": 1.3367611094739384e-05, - "loss": 1.38, - "step": 571 - }, - { - "epoch": 0.7911479944674965, - "grad_norm": 2.6734111309051514, - "learning_rate": 1.334700352146561e-05, - "loss": 1.3167, - "step": 572 - }, - { - "epoch": 0.7925311203319502, - "grad_norm": 2.242243766784668, - "learning_rate": 1.3326379928769114e-05, - "loss": 1.3883, - "step": 573 - }, - { - "epoch": 0.7939142461964038, - "grad_norm": 2.456390142440796, - "learning_rate": 1.3305740415358506e-05, - "loss": 1.4072, - "step": 574 - }, - { - "epoch": 0.7952973720608575, - "grad_norm": 2.449395179748535, - "learning_rate": 1.3285085080018589e-05, - "loss": 1.38, - "step": 575 - }, - { - "epoch": 0.7966804979253111, - "grad_norm": 2.6714186668395996, - "learning_rate": 1.3264414021609899e-05, - "loss": 1.3204, - "step": 576 - }, - { - "epoch": 0.7980636237897649, - "grad_norm": 2.479647397994995, - "learning_rate": 1.3243727339068216e-05, - "loss": 1.3234, - "step": 577 - }, - { - "epoch": 0.7994467496542186, - "grad_norm": 2.5289547443389893, - "learning_rate": 1.3223025131404106e-05, - "loss": 1.3533, - "step": 578 - }, - { - "epoch": 0.8008298755186722, - "grad_norm": 2.6476147174835205, - "learning_rate": 1.3202307497702443e-05, - "loss": 1.3419, - "step": 579 - }, - { - "epoch": 0.8022130013831259, - "grad_norm": 2.7068285942077637, - "learning_rate": 1.3181574537121933e-05, - "loss": 1.3636, - "step": 580 - }, - { - "epoch": 0.8035961272475796, - "grad_norm": 2.6454317569732666, - "learning_rate": 1.3160826348894635e-05, - "loss": 1.352, - "step": 581 - }, - { - "epoch": 0.8049792531120332, - "grad_norm": 2.513296127319336, - "learning_rate": 1.3140063032325491e-05, - "loss": 1.3446, - "step": 582 - }, - { - "epoch": 0.8063623789764869, - "grad_norm": 2.563950538635254, - "learning_rate": 1.3119284686791859e-05, - "loss": 1.4223, - "step": 583 - }, - { - "epoch": 0.8077455048409405, - "grad_norm": 2.553441047668457, - "learning_rate": 1.3098491411743014e-05, - "loss": 1.4375, - "step": 584 - }, - { - "epoch": 0.8091286307053942, - "grad_norm": 2.6420164108276367, - "learning_rate": 1.3077683306699702e-05, - "loss": 1.2831, - "step": 585 - }, - { - "epoch": 0.8105117565698479, - "grad_norm": 2.67268443107605, - "learning_rate": 1.3056860471253639e-05, - "loss": 1.4366, - "step": 586 - }, - { - "epoch": 0.8118948824343015, - "grad_norm": 2.650383472442627, - "learning_rate": 1.3036023005067042e-05, - "loss": 1.2001, - "step": 587 - }, - { - "epoch": 0.8132780082987552, - "grad_norm": 2.6096444129943848, - "learning_rate": 1.3015171007872161e-05, - "loss": 1.4101, - "step": 588 - }, - { - "epoch": 0.8146611341632088, - "grad_norm": 2.8249430656433105, - "learning_rate": 1.2994304579470787e-05, - "loss": 1.3862, - "step": 589 - }, - { - "epoch": 0.8160442600276625, - "grad_norm": 2.864114999771118, - "learning_rate": 1.297342381973379e-05, - "loss": 1.4244, - "step": 590 - }, - { - "epoch": 0.8174273858921162, - "grad_norm": 2.7902991771698, - "learning_rate": 1.2952528828600623e-05, - "loss": 1.3664, - "step": 591 - }, - { - "epoch": 0.8188105117565698, - "grad_norm": 2.8960015773773193, - "learning_rate": 1.2931619706078862e-05, - "loss": 1.3758, - "step": 592 - }, - { - "epoch": 0.8201936376210235, - "grad_norm": 2.313051223754883, - "learning_rate": 1.2910696552243708e-05, - "loss": 1.438, - "step": 593 - }, - { - "epoch": 0.8215767634854771, - "grad_norm": 2.627824544906616, - "learning_rate": 1.2889759467237532e-05, - "loss": 1.3203, - "step": 594 - }, - { - "epoch": 0.8229598893499308, - "grad_norm": 2.7432913780212402, - "learning_rate": 1.2868808551269374e-05, - "loss": 1.3742, - "step": 595 - }, - { - "epoch": 0.8243430152143845, - "grad_norm": 2.6407742500305176, - "learning_rate": 1.2847843904614474e-05, - "loss": 1.3906, - "step": 596 - }, - { - "epoch": 0.8257261410788381, - "grad_norm": 2.8636410236358643, - "learning_rate": 1.2826865627613785e-05, - "loss": 1.3633, - "step": 597 - }, - { - "epoch": 0.8271092669432918, - "grad_norm": 2.397425413131714, - "learning_rate": 1.2805873820673509e-05, - "loss": 1.4116, - "step": 598 - }, - { - "epoch": 0.8284923928077456, - "grad_norm": 2.8172311782836914, - "learning_rate": 1.2784868584264587e-05, - "loss": 1.3482, - "step": 599 - }, - { - "epoch": 0.8298755186721992, - "grad_norm": 2.766284227371216, - "learning_rate": 1.2763850018922257e-05, - "loss": 1.3548, - "step": 600 - }, - { - "epoch": 0.8312586445366529, - "grad_norm": 2.8403494358062744, - "learning_rate": 1.2742818225245538e-05, - "loss": 1.4169, - "step": 601 - }, - { - "epoch": 0.8326417704011065, - "grad_norm": 2.8391079902648926, - "learning_rate": 1.2721773303896765e-05, - "loss": 1.3907, - "step": 602 - }, - { - "epoch": 0.8340248962655602, - "grad_norm": 3.021026372909546, - "learning_rate": 1.2700715355601107e-05, - "loss": 1.2413, - "step": 603 - }, - { - "epoch": 0.8354080221300139, - "grad_norm": 2.439363956451416, - "learning_rate": 1.2679644481146081e-05, - "loss": 1.3966, - "step": 604 - }, - { - "epoch": 0.8367911479944675, - "grad_norm": 2.6423158645629883, - "learning_rate": 1.265856078138107e-05, - "loss": 1.3411, - "step": 605 - }, - { - "epoch": 0.8381742738589212, - "grad_norm": 2.848080635070801, - "learning_rate": 1.2637464357216847e-05, - "loss": 1.3249, - "step": 606 - }, - { - "epoch": 0.8395573997233748, - "grad_norm": 2.5756824016571045, - "learning_rate": 1.2616355309625076e-05, - "loss": 1.4163, - "step": 607 - }, - { - "epoch": 0.8409405255878285, - "grad_norm": 2.7762558460235596, - "learning_rate": 1.2595233739637851e-05, - "loss": 1.3655, - "step": 608 - }, - { - "epoch": 0.8423236514522822, - "grad_norm": 2.599550485610962, - "learning_rate": 1.2574099748347195e-05, - "loss": 1.3372, - "step": 609 - }, - { - "epoch": 0.8437067773167358, - "grad_norm": 2.7273144721984863, - "learning_rate": 1.2552953436904578e-05, - "loss": 1.2719, - "step": 610 - }, - { - "epoch": 0.8450899031811895, - "grad_norm": 2.3901562690734863, - "learning_rate": 1.2531794906520447e-05, - "loss": 1.3207, - "step": 611 - }, - { - "epoch": 0.8464730290456431, - "grad_norm": 2.7380173206329346, - "learning_rate": 1.2510624258463719e-05, - "loss": 1.3461, - "step": 612 - }, - { - "epoch": 0.8478561549100968, - "grad_norm": 2.6389694213867188, - "learning_rate": 1.248944159406132e-05, - "loss": 1.3443, - "step": 613 - }, - { - "epoch": 0.8492392807745505, - "grad_norm": 2.623025417327881, - "learning_rate": 1.246824701469768e-05, - "loss": 1.3893, - "step": 614 - }, - { - "epoch": 0.8506224066390041, - "grad_norm": 2.646023988723755, - "learning_rate": 1.2447040621814262e-05, - "loss": 1.372, - "step": 615 - }, - { - "epoch": 0.8520055325034578, - "grad_norm": 2.4752604961395264, - "learning_rate": 1.2425822516909065e-05, - "loss": 1.413, - "step": 616 - }, - { - "epoch": 0.8533886583679114, - "grad_norm": 2.928985834121704, - "learning_rate": 1.2404592801536151e-05, - "loss": 1.3141, - "step": 617 - }, - { - "epoch": 0.8547717842323651, - "grad_norm": 2.6695446968078613, - "learning_rate": 1.2383351577305148e-05, - "loss": 1.3596, - "step": 618 - }, - { - "epoch": 0.8561549100968188, - "grad_norm": 2.6358587741851807, - "learning_rate": 1.2362098945880765e-05, - "loss": 1.2506, - "step": 619 - }, - { - "epoch": 0.8575380359612724, - "grad_norm": 2.5874736309051514, - "learning_rate": 1.2340835008982315e-05, - "loss": 1.3993, - "step": 620 - }, - { - "epoch": 0.8589211618257261, - "grad_norm": 2.909454822540283, - "learning_rate": 1.2319559868383215e-05, - "loss": 1.3344, - "step": 621 - }, - { - "epoch": 0.8603042876901799, - "grad_norm": 2.6511762142181396, - "learning_rate": 1.2298273625910512e-05, - "loss": 1.3711, - "step": 622 - }, - { - "epoch": 0.8616874135546335, - "grad_norm": 2.508411169052124, - "learning_rate": 1.2276976383444384e-05, - "loss": 1.3315, - "step": 623 - }, - { - "epoch": 0.8630705394190872, - "grad_norm": 2.6246912479400635, - "learning_rate": 1.2255668242917651e-05, - "loss": 1.3493, - "step": 624 - }, - { - "epoch": 0.8644536652835408, - "grad_norm": 2.5388360023498535, - "learning_rate": 1.2234349306315308e-05, - "loss": 1.394, - "step": 625 - }, - { - "epoch": 0.8658367911479945, - "grad_norm": 2.443962812423706, - "learning_rate": 1.2213019675674008e-05, - "loss": 1.3458, - "step": 626 - }, - { - "epoch": 0.8672199170124482, - "grad_norm": 2.2994024753570557, - "learning_rate": 1.2191679453081598e-05, - "loss": 1.1168, - "step": 627 - }, - { - "epoch": 0.8686030428769018, - "grad_norm": 2.854074239730835, - "learning_rate": 1.2170328740676613e-05, - "loss": 1.4029, - "step": 628 - }, - { - "epoch": 0.8699861687413555, - "grad_norm": 2.6301426887512207, - "learning_rate": 1.2148967640647801e-05, - "loss": 1.2258, - "step": 629 - }, - { - "epoch": 0.8713692946058091, - "grad_norm": 2.680180311203003, - "learning_rate": 1.2127596255233622e-05, - "loss": 1.324, - "step": 630 - }, - { - "epoch": 0.8727524204702628, - "grad_norm": 2.532710075378418, - "learning_rate": 1.2106214686721763e-05, - "loss": 1.3227, - "step": 631 - }, - { - "epoch": 0.8741355463347165, - "grad_norm": 2.6774704456329346, - "learning_rate": 1.2084823037448654e-05, - "loss": 1.3242, - "step": 632 - }, - { - "epoch": 0.8755186721991701, - "grad_norm": 2.6629583835601807, - "learning_rate": 1.2063421409798974e-05, - "loss": 1.3086, - "step": 633 - }, - { - "epoch": 0.8769017980636238, - "grad_norm": 2.9211409091949463, - "learning_rate": 1.2042009906205152e-05, - "loss": 1.2639, - "step": 634 - }, - { - "epoch": 0.8782849239280774, - "grad_norm": 2.9951515197753906, - "learning_rate": 1.2020588629146897e-05, - "loss": 1.3449, - "step": 635 - }, - { - "epoch": 0.8796680497925311, - "grad_norm": 3.1883134841918945, - "learning_rate": 1.1999157681150683e-05, - "loss": 1.3168, - "step": 636 - }, - { - "epoch": 0.8810511756569848, - "grad_norm": 2.857069253921509, - "learning_rate": 1.1977717164789286e-05, - "loss": 1.3151, - "step": 637 - }, - { - "epoch": 0.8824343015214384, - "grad_norm": 2.866753339767456, - "learning_rate": 1.1956267182681265e-05, - "loss": 1.4015, - "step": 638 - }, - { - "epoch": 0.8838174273858921, - "grad_norm": 2.751404047012329, - "learning_rate": 1.193480783749049e-05, - "loss": 1.417, - "step": 639 - }, - { - "epoch": 0.8852005532503457, - "grad_norm": 2.753148078918457, - "learning_rate": 1.1913339231925642e-05, - "loss": 1.2989, - "step": 640 - }, - { - "epoch": 0.8865836791147994, - "grad_norm": 2.9508845806121826, - "learning_rate": 1.1891861468739729e-05, - "loss": 1.3506, - "step": 641 - }, - { - "epoch": 0.8879668049792531, - "grad_norm": 2.8244197368621826, - "learning_rate": 1.1870374650729582e-05, - "loss": 1.3815, - "step": 642 - }, - { - "epoch": 0.8893499308437067, - "grad_norm": 2.7276086807250977, - "learning_rate": 1.1848878880735374e-05, - "loss": 1.4006, - "step": 643 - }, - { - "epoch": 0.8907330567081605, - "grad_norm": 2.7729735374450684, - "learning_rate": 1.1827374261640128e-05, - "loss": 1.3746, - "step": 644 - }, - { - "epoch": 0.8921161825726142, - "grad_norm": 3.0188214778900146, - "learning_rate": 1.1805860896369212e-05, - "loss": 1.2675, - "step": 645 - }, - { - "epoch": 0.8934993084370678, - "grad_norm": 2.7433032989501953, - "learning_rate": 1.1784338887889858e-05, - "loss": 1.406, - "step": 646 - }, - { - "epoch": 0.8948824343015215, - "grad_norm": 3.0397491455078125, - "learning_rate": 1.1762808339210672e-05, - "loss": 1.2682, - "step": 647 - }, - { - "epoch": 0.8962655601659751, - "grad_norm": 2.876121997833252, - "learning_rate": 1.1741269353381128e-05, - "loss": 1.4072, - "step": 648 - }, - { - "epoch": 0.8976486860304288, - "grad_norm": 2.9146461486816406, - "learning_rate": 1.1719722033491086e-05, - "loss": 1.3016, - "step": 649 - }, - { - "epoch": 0.8990318118948825, - "grad_norm": 2.6939985752105713, - "learning_rate": 1.1698166482670293e-05, - "loss": 1.4551, - "step": 650 - }, - { - "epoch": 0.9004149377593361, - "grad_norm": 2.525695323944092, - "learning_rate": 1.1676602804087887e-05, - "loss": 1.3592, - "step": 651 - }, - { - "epoch": 0.9017980636237898, - "grad_norm": 2.9363038539886475, - "learning_rate": 1.165503110095191e-05, - "loss": 1.3402, - "step": 652 - }, - { - "epoch": 0.9031811894882434, - "grad_norm": 2.9058566093444824, - "learning_rate": 1.1633451476508819e-05, - "loss": 1.3014, - "step": 653 - }, - { - "epoch": 0.9045643153526971, - "grad_norm": 2.6156883239746094, - "learning_rate": 1.1611864034042972e-05, - "loss": 1.4221, - "step": 654 - }, - { - "epoch": 0.9059474412171508, - "grad_norm": 2.717287302017212, - "learning_rate": 1.1590268876876151e-05, - "loss": 1.2827, - "step": 655 - }, - { - "epoch": 0.9073305670816044, - "grad_norm": 2.8226826190948486, - "learning_rate": 1.1568666108367066e-05, - "loss": 1.2393, - "step": 656 - }, - { - "epoch": 0.9087136929460581, - "grad_norm": 3.023271322250366, - "learning_rate": 1.1547055831910841e-05, - "loss": 1.2443, - "step": 657 - }, - { - "epoch": 0.9100968188105117, - "grad_norm": 3.041276693344116, - "learning_rate": 1.1525438150938554e-05, - "loss": 1.3386, - "step": 658 - }, - { - "epoch": 0.9114799446749654, - "grad_norm": 3.0393760204315186, - "learning_rate": 1.1503813168916715e-05, - "loss": 1.4778, - "step": 659 - }, - { - "epoch": 0.9128630705394191, - "grad_norm": 2.7392125129699707, - "learning_rate": 1.1482180989346771e-05, - "loss": 1.3789, - "step": 660 - }, - { - "epoch": 0.9142461964038727, - "grad_norm": 2.8566339015960693, - "learning_rate": 1.1460541715764628e-05, - "loss": 1.4119, - "step": 661 - }, - { - "epoch": 0.9156293222683264, - "grad_norm": 2.5865392684936523, - "learning_rate": 1.1438895451740141e-05, - "loss": 1.4265, - "step": 662 - }, - { - "epoch": 0.91701244813278, - "grad_norm": 2.861591100692749, - "learning_rate": 1.1417242300876621e-05, - "loss": 1.3768, - "step": 663 - }, - { - "epoch": 0.9183955739972337, - "grad_norm": 2.6707632541656494, - "learning_rate": 1.1395582366810348e-05, - "loss": 1.3279, - "step": 664 - }, - { - "epoch": 0.9197786998616874, - "grad_norm": 3.7327070236206055, - "learning_rate": 1.1373915753210056e-05, - "loss": 1.34, - "step": 665 - }, - { - "epoch": 0.921161825726141, - "grad_norm": 2.8060691356658936, - "learning_rate": 1.135224256377646e-05, - "loss": 1.3169, - "step": 666 - }, - { - "epoch": 0.9225449515905948, - "grad_norm": 2.822150707244873, - "learning_rate": 1.1330562902241742e-05, - "loss": 1.4016, - "step": 667 - }, - { - "epoch": 0.9239280774550485, - "grad_norm": 3.038985252380371, - "learning_rate": 1.1308876872369062e-05, - "loss": 1.2763, - "step": 668 - }, - { - "epoch": 0.9253112033195021, - "grad_norm": 2.8417439460754395, - "learning_rate": 1.128718457795206e-05, - "loss": 1.3405, - "step": 669 - }, - { - "epoch": 0.9266943291839558, - "grad_norm": 2.795173406600952, - "learning_rate": 1.1265486122814359e-05, - "loss": 1.3664, - "step": 670 - }, - { - "epoch": 0.9280774550484094, - "grad_norm": 2.7899844646453857, - "learning_rate": 1.124378161080907e-05, - "loss": 1.3715, - "step": 671 - }, - { - "epoch": 0.9294605809128631, - "grad_norm": 2.6709322929382324, - "learning_rate": 1.1222071145818293e-05, - "loss": 1.3386, - "step": 672 - }, - { - "epoch": 0.9308437067773168, - "grad_norm": 2.7561426162719727, - "learning_rate": 1.120035483175262e-05, - "loss": 1.3753, - "step": 673 - }, - { - "epoch": 0.9322268326417704, - "grad_norm": 2.7683603763580322, - "learning_rate": 1.1178632772550636e-05, - "loss": 1.2831, - "step": 674 - }, - { - "epoch": 0.9336099585062241, - "grad_norm": 3.2102935314178467, - "learning_rate": 1.1156905072178425e-05, - "loss": 1.2388, - "step": 675 - }, - { - "epoch": 0.9349930843706777, - "grad_norm": 2.959686040878296, - "learning_rate": 1.113517183462907e-05, - "loss": 1.3251, - "step": 676 - }, - { - "epoch": 0.9363762102351314, - "grad_norm": 2.9151217937469482, - "learning_rate": 1.1113433163922161e-05, - "loss": 1.3598, - "step": 677 - }, - { - "epoch": 0.9377593360995851, - "grad_norm": 2.778136730194092, - "learning_rate": 1.1091689164103281e-05, - "loss": 1.3768, - "step": 678 - }, - { - "epoch": 0.9391424619640387, - "grad_norm": 2.6228294372558594, - "learning_rate": 1.1069939939243531e-05, - "loss": 1.2864, - "step": 679 - }, - { - "epoch": 0.9405255878284924, - "grad_norm": 2.5928773880004883, - "learning_rate": 1.1048185593439014e-05, - "loss": 1.3992, - "step": 680 - }, - { - "epoch": 0.941908713692946, - "grad_norm": 3.3055930137634277, - "learning_rate": 1.1026426230810342e-05, - "loss": 1.2491, - "step": 681 - }, - { - "epoch": 0.9432918395573997, - "grad_norm": 3.021859884262085, - "learning_rate": 1.1004661955502143e-05, - "loss": 1.3086, - "step": 682 - }, - { - "epoch": 0.9446749654218534, - "grad_norm": 3.3848326206207275, - "learning_rate": 1.0982892871682556e-05, - "loss": 1.363, - "step": 683 - }, - { - "epoch": 0.946058091286307, - "grad_norm": 3.09657883644104, - "learning_rate": 1.0961119083542727e-05, - "loss": 1.2815, - "step": 684 - }, - { - "epoch": 0.9474412171507607, - "grad_norm": 2.734449625015259, - "learning_rate": 1.0939340695296332e-05, - "loss": 1.326, - "step": 685 - }, - { - "epoch": 0.9488243430152143, - "grad_norm": 3.091350555419922, - "learning_rate": 1.0917557811179057e-05, - "loss": 1.3682, - "step": 686 - }, - { - "epoch": 0.950207468879668, - "grad_norm": 2.9399776458740234, - "learning_rate": 1.0895770535448103e-05, - "loss": 1.1755, - "step": 687 - }, - { - "epoch": 0.9515905947441217, - "grad_norm": 3.312972068786621, - "learning_rate": 1.0873978972381692e-05, - "loss": 1.4613, - "step": 688 - }, - { - "epoch": 0.9529737206085753, - "grad_norm": 3.2397265434265137, - "learning_rate": 1.0852183226278568e-05, - "loss": 1.2265, - "step": 689 - }, - { - "epoch": 0.9543568464730291, - "grad_norm": 3.401766300201416, - "learning_rate": 1.0830383401457499e-05, - "loss": 1.1583, - "step": 690 - }, - { - "epoch": 0.9557399723374828, - "grad_norm": 3.0945234298706055, - "learning_rate": 1.0808579602256766e-05, - "loss": 1.2549, - "step": 691 - }, - { - "epoch": 0.9571230982019364, - "grad_norm": 2.892479658126831, - "learning_rate": 1.0786771933033677e-05, - "loss": 1.2499, - "step": 692 - }, - { - "epoch": 0.9585062240663901, - "grad_norm": 3.107635736465454, - "learning_rate": 1.0764960498164066e-05, - "loss": 1.3087, - "step": 693 - }, - { - "epoch": 0.9598893499308437, - "grad_norm": 3.117335319519043, - "learning_rate": 1.0743145402041781e-05, - "loss": 1.2303, - "step": 694 - }, - { - "epoch": 0.9612724757952974, - "grad_norm": 3.034398078918457, - "learning_rate": 1.0721326749078205e-05, - "loss": 1.315, - "step": 695 - }, - { - "epoch": 0.9626556016597511, - "grad_norm": 3.142618417739868, - "learning_rate": 1.0699504643701732e-05, - "loss": 1.3106, - "step": 696 - }, - { - "epoch": 0.9640387275242047, - "grad_norm": 2.986525535583496, - "learning_rate": 1.0677679190357292e-05, - "loss": 1.1909, - "step": 697 - }, - { - "epoch": 0.9654218533886584, - "grad_norm": 3.2569093704223633, - "learning_rate": 1.0655850493505834e-05, - "loss": 1.2402, - "step": 698 - }, - { - "epoch": 0.966804979253112, - "grad_norm": 3.035935163497925, - "learning_rate": 1.0634018657623827e-05, - "loss": 1.3137, - "step": 699 - }, - { - "epoch": 0.9681881051175657, - "grad_norm": 2.9152543544769287, - "learning_rate": 1.0612183787202768e-05, - "loss": 1.299, - "step": 700 - }, - { - "epoch": 0.9695712309820194, - "grad_norm": 3.0205094814300537, - "learning_rate": 1.059034598674868e-05, - "loss": 1.2816, - "step": 701 - }, - { - "epoch": 0.970954356846473, - "grad_norm": 3.073659658432007, - "learning_rate": 1.0568505360781606e-05, - "loss": 1.2782, - "step": 702 - }, - { - "epoch": 0.9723374827109267, - "grad_norm": 3.1953279972076416, - "learning_rate": 1.0546662013835119e-05, - "loss": 1.3486, - "step": 703 - }, - { - "epoch": 0.9737206085753803, - "grad_norm": 3.613943576812744, - "learning_rate": 1.0524816050455801e-05, - "loss": 1.1737, - "step": 704 - }, - { - "epoch": 0.975103734439834, - "grad_norm": 3.117497682571411, - "learning_rate": 1.0502967575202769e-05, - "loss": 1.3392, - "step": 705 - }, - { - "epoch": 0.9764868603042877, - "grad_norm": 3.1448299884796143, - "learning_rate": 1.0481116692647165e-05, - "loss": 1.2706, - "step": 706 - }, - { - "epoch": 0.9778699861687413, - "grad_norm": 2.8863043785095215, - "learning_rate": 1.045926350737164e-05, - "loss": 1.3248, - "step": 707 - }, - { - "epoch": 0.979253112033195, - "grad_norm": 3.1689250469207764, - "learning_rate": 1.0437408123969877e-05, - "loss": 1.2143, - "step": 708 - }, - { - "epoch": 0.9806362378976486, - "grad_norm": 2.828787088394165, - "learning_rate": 1.0415550647046074e-05, - "loss": 1.3921, - "step": 709 - }, - { - "epoch": 0.9820193637621023, - "grad_norm": 2.9206900596618652, - "learning_rate": 1.039369118121445e-05, - "loss": 1.4469, - "step": 710 - }, - { - "epoch": 0.983402489626556, - "grad_norm": 2.994562864303589, - "learning_rate": 1.0371829831098747e-05, - "loss": 1.3058, - "step": 711 - }, - { - "epoch": 0.9847856154910097, - "grad_norm": 2.9757678508758545, - "learning_rate": 1.0349966701331721e-05, - "loss": 1.3384, - "step": 712 - }, - { - "epoch": 0.9861687413554634, - "grad_norm": 3.144249200820923, - "learning_rate": 1.0328101896554647e-05, - "loss": 1.3066, - "step": 713 - }, - { - "epoch": 0.9875518672199171, - "grad_norm": 3.0815372467041016, - "learning_rate": 1.0306235521416822e-05, - "loss": 1.3301, - "step": 714 - }, - { - "epoch": 0.9889349930843707, - "grad_norm": 2.8530967235565186, - "learning_rate": 1.0284367680575045e-05, - "loss": 1.3116, - "step": 715 - }, - { - "epoch": 0.9903181189488244, - "grad_norm": 2.911203145980835, - "learning_rate": 1.0262498478693148e-05, - "loss": 1.3735, - "step": 716 - }, - { - "epoch": 0.991701244813278, - "grad_norm": 2.647660493850708, - "learning_rate": 1.0240628020441468e-05, - "loss": 1.3103, - "step": 717 - }, - { - "epoch": 0.9930843706777317, - "grad_norm": 3.142230749130249, - "learning_rate": 1.0218756410496353e-05, - "loss": 1.2823, - "step": 718 - }, - { - "epoch": 0.9944674965421854, - "grad_norm": 2.798553705215454, - "learning_rate": 1.019688375353967e-05, - "loss": 1.3124, - "step": 719 - }, - { - "epoch": 0.995850622406639, - "grad_norm": 3.3929383754730225, - "learning_rate": 1.0175010154258288e-05, - "loss": 1.2478, - "step": 720 - }, - { - "epoch": 0.9972337482710927, - "grad_norm": 3.2481846809387207, - "learning_rate": 1.0153135717343599e-05, - "loss": 1.2861, - "step": 721 - }, - { - "epoch": 0.9986168741355463, - "grad_norm": 2.964385747909546, - "learning_rate": 1.013126054749099e-05, - "loss": 1.3865, - "step": 722 - }, - { - "epoch": 1.0, - "grad_norm": 3.1757357120513916, - "learning_rate": 1.0109384749399369e-05, - "loss": 1.3613, - "step": 723 - } - ], - "logging_steps": 1, - "max_steps": 1446, - "num_input_tokens_seen": 0, - "num_train_epochs": 2, - "save_steps": 723, - "stateful_callbacks": { - "TrainerControl": { - "args": { - "should_epoch_stop": false, - "should_evaluate": false, - "should_log": false, - "should_save": true, - "should_training_stop": false - }, - "attributes": {} - } - }, - "total_flos": 1.4040967111099023e+18, - "train_batch_size": 4, - "trial_name": null, - "trial_params": null -}