|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 501, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.001996007984031936, |
|
"grad_norm": 92.81291961669922, |
|
"learning_rate": 0.0, |
|
"loss": 5.1027, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.003992015968063872, |
|
"grad_norm": 187.8679656982422, |
|
"learning_rate": 5.017166594399687e-06, |
|
"loss": 5.1552, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.005988023952095809, |
|
"grad_norm": 160.3046875, |
|
"learning_rate": 7.952020911994375e-06, |
|
"loss": 5.1408, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.007984031936127744, |
|
"grad_norm": 44.23579406738281, |
|
"learning_rate": 1.0034333188799373e-05, |
|
"loss": 3.2825, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.00998003992015968, |
|
"grad_norm": 22.931053161621094, |
|
"learning_rate": 1.164950007226698e-05, |
|
"loss": 2.5601, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.011976047904191617, |
|
"grad_norm": 10.358180046081543, |
|
"learning_rate": 1.2969187506394062e-05, |
|
"loss": 1.994, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.013972055888223553, |
|
"grad_norm": 9.385107040405273, |
|
"learning_rate": 1.4084967333570947e-05, |
|
"loss": 1.8568, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.015968063872255488, |
|
"grad_norm": 7.71799898147583, |
|
"learning_rate": 1.505149978319906e-05, |
|
"loss": 1.59, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.017964071856287425, |
|
"grad_norm": 5.006285190582275, |
|
"learning_rate": 1.590404182398875e-05, |
|
"loss": 1.3238, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.01996007984031936, |
|
"grad_norm": 4.380033493041992, |
|
"learning_rate": 1.666666666666667e-05, |
|
"loss": 1.267, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.021956087824351298, |
|
"grad_norm": 4.105769634246826, |
|
"learning_rate": 1.7356544752637084e-05, |
|
"loss": 1.2438, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.023952095808383235, |
|
"grad_norm": 3.476895809173584, |
|
"learning_rate": 1.7986354100793748e-05, |
|
"loss": 1.1196, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.02594810379241517, |
|
"grad_norm": 2.9743728637695312, |
|
"learning_rate": 1.8565722538447282e-05, |
|
"loss": 1.0148, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.027944111776447105, |
|
"grad_norm": 2.6658384799957275, |
|
"learning_rate": 1.9102133927970633e-05, |
|
"loss": 1.0081, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.029940119760479042, |
|
"grad_norm": 2.6062169075012207, |
|
"learning_rate": 1.9601520984261358e-05, |
|
"loss": 0.9228, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.031936127744510975, |
|
"grad_norm": 2.147310495376587, |
|
"learning_rate": 2.0068666377598747e-05, |
|
"loss": 0.8351, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.033932135728542916, |
|
"grad_norm": 2.2878642082214355, |
|
"learning_rate": 2.0507482022971233e-05, |
|
"loss": 0.8303, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.03592814371257485, |
|
"grad_norm": 2.077786445617676, |
|
"learning_rate": 2.0921208418388435e-05, |
|
"loss": 0.7769, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.03792415169660679, |
|
"grad_norm": 2.115493059158325, |
|
"learning_rate": 2.1312560015880482e-05, |
|
"loss": 0.8032, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.03992015968063872, |
|
"grad_norm": 1.92618989944458, |
|
"learning_rate": 2.1683833261066357e-05, |
|
"loss": 0.7759, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.041916167664670656, |
|
"grad_norm": 1.9667437076568604, |
|
"learning_rate": 2.2036988245565324e-05, |
|
"loss": 0.7805, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.043912175648702596, |
|
"grad_norm": 2.3144752979278564, |
|
"learning_rate": 2.2373711347036773e-05, |
|
"loss": 0.735, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.04590818363273453, |
|
"grad_norm": 1.555964469909668, |
|
"learning_rate": 2.269546393362655e-05, |
|
"loss": 0.6523, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.04790419161676647, |
|
"grad_norm": 1.5024523735046387, |
|
"learning_rate": 2.3003520695193437e-05, |
|
"loss": 0.6623, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.0499001996007984, |
|
"grad_norm": 1.522902011871338, |
|
"learning_rate": 2.329900014453396e-05, |
|
"loss": 0.6503, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.05189620758483034, |
|
"grad_norm": 1.4194371700286865, |
|
"learning_rate": 2.3582889132846968e-05, |
|
"loss": 0.636, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.05389221556886228, |
|
"grad_norm": 1.55453360080719, |
|
"learning_rate": 2.3856062735983123e-05, |
|
"loss": 0.7242, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.05588822355289421, |
|
"grad_norm": 1.4471536874771118, |
|
"learning_rate": 2.4119300522370322e-05, |
|
"loss": 0.5819, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.05788423153692615, |
|
"grad_norm": 1.5161927938461304, |
|
"learning_rate": 2.4373299964982603e-05, |
|
"loss": 0.6788, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.059880239520958084, |
|
"grad_norm": 1.5962581634521484, |
|
"learning_rate": 2.4618687578661044e-05, |
|
"loss": 0.7346, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06187624750499002, |
|
"grad_norm": 1.3744760751724243, |
|
"learning_rate": 2.4856028230571212e-05, |
|
"loss": 0.4835, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.06387225548902195, |
|
"grad_norm": 1.7308415174484253, |
|
"learning_rate": 2.5085832971998436e-05, |
|
"loss": 0.6537, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0658682634730539, |
|
"grad_norm": 1.736331582069397, |
|
"learning_rate": 2.530856566463146e-05, |
|
"loss": 0.6633, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.06786427145708583, |
|
"grad_norm": 1.9755364656448364, |
|
"learning_rate": 2.552464861737092e-05, |
|
"loss": 0.6268, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.06986027944111776, |
|
"grad_norm": 1.6539369821548462, |
|
"learning_rate": 2.5734467405837933e-05, |
|
"loss": 0.6355, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0718562874251497, |
|
"grad_norm": 2.0570621490478516, |
|
"learning_rate": 2.5938375012788124e-05, |
|
"loss": 0.6168, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.07385229540918163, |
|
"grad_norm": 1.8512474298477173, |
|
"learning_rate": 2.6136695401116585e-05, |
|
"loss": 0.6583, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.07584830339321358, |
|
"grad_norm": 1.2911862134933472, |
|
"learning_rate": 2.6329726610280168e-05, |
|
"loss": 0.565, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.07784431137724551, |
|
"grad_norm": 1.475156545639038, |
|
"learning_rate": 2.651774345044166e-05, |
|
"loss": 0.6409, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.07984031936127745, |
|
"grad_norm": 1.1098164319992065, |
|
"learning_rate": 2.6700999855466042e-05, |
|
"loss": 0.5335, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08183632734530938, |
|
"grad_norm": 1.1890451908111572, |
|
"learning_rate": 2.687973094532893e-05, |
|
"loss": 0.4502, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.08383233532934131, |
|
"grad_norm": 1.9120031595230103, |
|
"learning_rate": 2.7054154839965013e-05, |
|
"loss": 0.607, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.08582834331337326, |
|
"grad_norm": 1.2188658714294434, |
|
"learning_rate": 2.722447425965978e-05, |
|
"loss": 0.5033, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.08782435129740519, |
|
"grad_norm": 1.3608094453811646, |
|
"learning_rate": 2.739087794143646e-05, |
|
"loss": 0.5956, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08982035928143713, |
|
"grad_norm": 1.259487271308899, |
|
"learning_rate": 2.755354189625573e-05, |
|
"loss": 0.575, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.09181636726546906, |
|
"grad_norm": 1.2308496236801147, |
|
"learning_rate": 2.771263052802624e-05, |
|
"loss": 0.6473, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.09381237524950099, |
|
"grad_norm": 1.2072350978851318, |
|
"learning_rate": 2.7868297632261957e-05, |
|
"loss": 0.6273, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.09580838323353294, |
|
"grad_norm": 1.150260090827942, |
|
"learning_rate": 2.8020687289593123e-05, |
|
"loss": 0.618, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.09780439121756487, |
|
"grad_norm": 1.1447213888168335, |
|
"learning_rate": 2.8169934667141895e-05, |
|
"loss": 0.625, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.0998003992015968, |
|
"grad_norm": 1.1371378898620605, |
|
"learning_rate": 2.8316166738933646e-05, |
|
"loss": 0.6372, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.10179640718562874, |
|
"grad_norm": 1.1135759353637695, |
|
"learning_rate": 2.845950293496561e-05, |
|
"loss": 0.539, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.10379241516966067, |
|
"grad_norm": 0.9502639174461365, |
|
"learning_rate": 2.8600055727246657e-05, |
|
"loss": 0.388, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.10578842315369262, |
|
"grad_norm": 1.545538306236267, |
|
"learning_rate": 2.8737931160013153e-05, |
|
"loss": 0.5401, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.10778443113772455, |
|
"grad_norm": 1.223322868347168, |
|
"learning_rate": 2.8873229330382812e-05, |
|
"loss": 0.5695, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.10978043912175649, |
|
"grad_norm": 1.0864529609680176, |
|
"learning_rate": 2.9006044824904066e-05, |
|
"loss": 0.4986, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.11177644710578842, |
|
"grad_norm": 1.1569509506225586, |
|
"learning_rate": 2.913646711677001e-05, |
|
"loss": 0.5629, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.11377245508982035, |
|
"grad_norm": 1.3813297748565674, |
|
"learning_rate": 2.926458092787486e-05, |
|
"loss": 0.605, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.1157684630738523, |
|
"grad_norm": 1.034891128540039, |
|
"learning_rate": 2.939046655938229e-05, |
|
"loss": 0.5247, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.11776447105788423, |
|
"grad_norm": 1.0968964099884033, |
|
"learning_rate": 2.951420019403574e-05, |
|
"loss": 0.5797, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.11976047904191617, |
|
"grad_norm": 1.0885212421417236, |
|
"learning_rate": 2.963585417306073e-05, |
|
"loss": 0.5689, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1217564870259481, |
|
"grad_norm": 1.2548822164535522, |
|
"learning_rate": 2.9755497250179453e-05, |
|
"loss": 0.5559, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.12375249500998003, |
|
"grad_norm": 1.009814739227295, |
|
"learning_rate": 2.98731948249709e-05, |
|
"loss": 0.4973, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.12574850299401197, |
|
"grad_norm": 1.0727399587631226, |
|
"learning_rate": 2.9989009157559694e-05, |
|
"loss": 0.5439, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.1277445109780439, |
|
"grad_norm": 1.1233041286468506, |
|
"learning_rate": 3.010299956639812e-05, |
|
"loss": 0.5472, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.12974051896207583, |
|
"grad_norm": 1.1565264463424683, |
|
"learning_rate": 3.021522261071426e-05, |
|
"loss": 0.6008, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.1317365269461078, |
|
"grad_norm": 0.9942654371261597, |
|
"learning_rate": 3.0325732259031143e-05, |
|
"loss": 0.4552, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.13373253493013973, |
|
"grad_norm": 1.100710153579712, |
|
"learning_rate": 3.043458004501377e-05, |
|
"loss": 0.4661, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.13572854291417166, |
|
"grad_norm": 1.0481464862823486, |
|
"learning_rate": 3.054181521177061e-05, |
|
"loss": 0.4996, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1377245508982036, |
|
"grad_norm": 1.085190773010254, |
|
"learning_rate": 3.064748484562093e-05, |
|
"loss": 0.5589, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.13972055888223553, |
|
"grad_norm": 1.0909191370010376, |
|
"learning_rate": 3.0751634000237615e-05, |
|
"loss": 0.5948, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.14171656686626746, |
|
"grad_norm": 1.9369421005249023, |
|
"learning_rate": 3.085430581198459e-05, |
|
"loss": 0.5384, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.1437125748502994, |
|
"grad_norm": 1.1248409748077393, |
|
"learning_rate": 3.095554160718781e-05, |
|
"loss": 0.4915, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.14570858283433133, |
|
"grad_norm": 1.028275728225708, |
|
"learning_rate": 3.10553810020076e-05, |
|
"loss": 0.5405, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.14770459081836326, |
|
"grad_norm": 0.9245263338088989, |
|
"learning_rate": 3.115386199551628e-05, |
|
"loss": 0.4313, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1497005988023952, |
|
"grad_norm": 1.0587871074676514, |
|
"learning_rate": 3.1251021056528336e-05, |
|
"loss": 0.5154, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.15169660678642716, |
|
"grad_norm": 1.0819029808044434, |
|
"learning_rate": 3.134689320467986e-05, |
|
"loss": 0.5097, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1536926147704591, |
|
"grad_norm": 1.0212074518203735, |
|
"learning_rate": 3.144151208620804e-05, |
|
"loss": 0.4365, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.15568862275449102, |
|
"grad_norm": 1.140681266784668, |
|
"learning_rate": 3.1534910044841344e-05, |
|
"loss": 0.5734, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.15768463073852296, |
|
"grad_norm": 1.0276720523834229, |
|
"learning_rate": 3.1627118188174024e-05, |
|
"loss": 0.42, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.1596806387225549, |
|
"grad_norm": 0.980180025100708, |
|
"learning_rate": 3.171816644986573e-05, |
|
"loss": 0.4796, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.16167664670658682, |
|
"grad_norm": 1.198864221572876, |
|
"learning_rate": 3.18080836479775e-05, |
|
"loss": 0.5675, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.16367265469061876, |
|
"grad_norm": 0.9353108406066895, |
|
"learning_rate": 3.1896897539728616e-05, |
|
"loss": 0.5183, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.1656686626746507, |
|
"grad_norm": 0.9708541035652161, |
|
"learning_rate": 3.198463487293457e-05, |
|
"loss": 0.4513, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.16766467065868262, |
|
"grad_norm": 1.1432932615280151, |
|
"learning_rate": 3.207132143436469e-05, |
|
"loss": 0.589, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.16966067864271456, |
|
"grad_norm": 1.0964723825454712, |
|
"learning_rate": 3.215698209523821e-05, |
|
"loss": 0.5101, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.17165668662674652, |
|
"grad_norm": 1.0808310508728027, |
|
"learning_rate": 3.224164085405946e-05, |
|
"loss": 0.4349, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.17365269461077845, |
|
"grad_norm": 1.0994106531143188, |
|
"learning_rate": 3.232532087697698e-05, |
|
"loss": 0.4965, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.17564870259481039, |
|
"grad_norm": 1.2377325296401978, |
|
"learning_rate": 3.240804453583615e-05, |
|
"loss": 0.4444, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.17764471057884232, |
|
"grad_norm": 1.0575945377349854, |
|
"learning_rate": 3.248983344408188e-05, |
|
"loss": 0.4379, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.17964071856287425, |
|
"grad_norm": 0.8877758979797363, |
|
"learning_rate": 3.2570708490655414e-05, |
|
"loss": 0.453, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.18163672654690619, |
|
"grad_norm": 1.0481340885162354, |
|
"learning_rate": 3.265068987201822e-05, |
|
"loss": 0.519, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.18363273453093812, |
|
"grad_norm": 1.026150107383728, |
|
"learning_rate": 3.2729797122425925e-05, |
|
"loss": 0.5112, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.18562874251497005, |
|
"grad_norm": 0.8472252488136292, |
|
"learning_rate": 3.280804914256559e-05, |
|
"loss": 0.4302, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.18762475049900199, |
|
"grad_norm": 0.9228626489639282, |
|
"learning_rate": 3.288546422666164e-05, |
|
"loss": 0.4814, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.18962075848303392, |
|
"grad_norm": 1.0165542364120483, |
|
"learning_rate": 3.2962060088147464e-05, |
|
"loss": 0.5035, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.19161676646706588, |
|
"grad_norm": 1.091426134109497, |
|
"learning_rate": 3.3037853883992805e-05, |
|
"loss": 0.5718, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.1936127744510978, |
|
"grad_norm": 1.0953468084335327, |
|
"learning_rate": 3.3112862237770756e-05, |
|
"loss": 0.5522, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.19560878243512975, |
|
"grad_norm": 0.9461252689361572, |
|
"learning_rate": 3.3187101261541584e-05, |
|
"loss": 0.5257, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.19760479041916168, |
|
"grad_norm": 1.063242793083191, |
|
"learning_rate": 3.326058657662584e-05, |
|
"loss": 0.511, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.1996007984031936, |
|
"grad_norm": 1.0084831714630127, |
|
"learning_rate": 3.333333333333334e-05, |
|
"loss": 0.5182, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.20159680638722555, |
|
"grad_norm": 0.9839895963668823, |
|
"learning_rate": 3.340535622971072e-05, |
|
"loss": 0.4776, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.20359281437125748, |
|
"grad_norm": 0.9757642149925232, |
|
"learning_rate": 3.3476669529365295e-05, |
|
"loss": 0.4915, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2055888223552894, |
|
"grad_norm": 0.8425347208976746, |
|
"learning_rate": 3.3547287078419544e-05, |
|
"loss": 0.3955, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.20758483033932135, |
|
"grad_norm": 0.9176936745643616, |
|
"learning_rate": 3.361722232164634e-05, |
|
"loss": 0.4132, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.20958083832335328, |
|
"grad_norm": 1.0560258626937866, |
|
"learning_rate": 3.3686488317832306e-05, |
|
"loss": 0.5133, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.21157684630738524, |
|
"grad_norm": 0.9101148247718811, |
|
"learning_rate": 3.375509775441284e-05, |
|
"loss": 0.3898, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.21357285429141717, |
|
"grad_norm": 0.8682689666748047, |
|
"learning_rate": 3.382306296142016e-05, |
|
"loss": 0.4353, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.2155688622754491, |
|
"grad_norm": 0.8694739937782288, |
|
"learning_rate": 3.38903959247825e-05, |
|
"loss": 0.5008, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.21756487025948104, |
|
"grad_norm": 0.8936677575111389, |
|
"learning_rate": 3.395710829901039e-05, |
|
"loss": 0.4203, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.21956087824351297, |
|
"grad_norm": 0.936951220035553, |
|
"learning_rate": 3.402321141930376e-05, |
|
"loss": 0.4798, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2215568862275449, |
|
"grad_norm": 0.8947778344154358, |
|
"learning_rate": 3.4088716313110955e-05, |
|
"loss": 0.4855, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.22355289421157684, |
|
"grad_norm": 0.8714671730995178, |
|
"learning_rate": 3.415363371116969e-05, |
|
"loss": 0.4973, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.22554890219560877, |
|
"grad_norm": 0.8940010070800781, |
|
"learning_rate": 3.4217974058057e-05, |
|
"loss": 0.5217, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.2275449101796407, |
|
"grad_norm": 0.8057599663734436, |
|
"learning_rate": 3.428174752227455e-05, |
|
"loss": 0.3906, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.22954091816367264, |
|
"grad_norm": 1.0616763830184937, |
|
"learning_rate": 3.434496400589353e-05, |
|
"loss": 0.4958, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.2315369261477046, |
|
"grad_norm": 0.8679594993591309, |
|
"learning_rate": 3.440763315378198e-05, |
|
"loss": 0.4526, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.23353293413173654, |
|
"grad_norm": 0.8972085118293762, |
|
"learning_rate": 3.446976436243603e-05, |
|
"loss": 0.4559, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.23552894211576847, |
|
"grad_norm": 0.9083353877067566, |
|
"learning_rate": 3.4531366788435425e-05, |
|
"loss": 0.5048, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2375249500998004, |
|
"grad_norm": 0.8607695698738098, |
|
"learning_rate": 3.459244935654219e-05, |
|
"loss": 0.4128, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.23952095808383234, |
|
"grad_norm": 0.8851041793823242, |
|
"learning_rate": 3.465302076746041e-05, |
|
"loss": 0.4602, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.24151696606786427, |
|
"grad_norm": 0.9059931039810181, |
|
"learning_rate": 3.471308950527417e-05, |
|
"loss": 0.4791, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.2435129740518962, |
|
"grad_norm": 0.9063411951065063, |
|
"learning_rate": 3.477266384457914e-05, |
|
"loss": 0.4741, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.24550898203592814, |
|
"grad_norm": 0.8850985765457153, |
|
"learning_rate": 3.48317518573233e-05, |
|
"loss": 0.4292, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.24750499001996007, |
|
"grad_norm": 0.9396518468856812, |
|
"learning_rate": 3.489036141937059e-05, |
|
"loss": 0.5069, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.249500998003992, |
|
"grad_norm": 0.9115111231803894, |
|
"learning_rate": 3.494850021680094e-05, |
|
"loss": 0.4823, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.25149700598802394, |
|
"grad_norm": 0.8799051642417908, |
|
"learning_rate": 3.500617575195938e-05, |
|
"loss": 0.3732, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.25349301397205587, |
|
"grad_norm": 0.9273744821548462, |
|
"learning_rate": 3.5063395349265945e-05, |
|
"loss": 0.4284, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.2554890219560878, |
|
"grad_norm": 1.0624243021011353, |
|
"learning_rate": 3.5120166160797804e-05, |
|
"loss": 0.4322, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.25748502994011974, |
|
"grad_norm": 0.8508513569831848, |
|
"learning_rate": 3.517649517165415e-05, |
|
"loss": 0.4465, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.25948103792415167, |
|
"grad_norm": 0.8986352682113647, |
|
"learning_rate": 3.523238920511395e-05, |
|
"loss": 0.4093, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.26147704590818366, |
|
"grad_norm": 0.9224410653114319, |
|
"learning_rate": 3.528785492759607e-05, |
|
"loss": 0.4735, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.2634730538922156, |
|
"grad_norm": 0.9467160105705261, |
|
"learning_rate": 3.5342898853430836e-05, |
|
"loss": 0.4952, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.2654690618762475, |
|
"grad_norm": 0.9140215516090393, |
|
"learning_rate": 3.539752734945143e-05, |
|
"loss": 0.4516, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.26746506986027946, |
|
"grad_norm": 0.9906129837036133, |
|
"learning_rate": 3.5451746639413466e-05, |
|
"loss": 0.3785, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.2694610778443114, |
|
"grad_norm": 0.8118170499801636, |
|
"learning_rate": 3.550556280825011e-05, |
|
"loss": 0.4324, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.2714570858283433, |
|
"grad_norm": 0.9162650108337402, |
|
"learning_rate": 3.55589818061703e-05, |
|
"loss": 0.3836, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.27345309381237526, |
|
"grad_norm": 0.8672250509262085, |
|
"learning_rate": 3.561200945260678e-05, |
|
"loss": 0.4462, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.2754491017964072, |
|
"grad_norm": 0.906155526638031, |
|
"learning_rate": 3.5664651440020616e-05, |
|
"loss": 0.4749, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.2774451097804391, |
|
"grad_norm": 0.9452763199806213, |
|
"learning_rate": 3.571691333756825e-05, |
|
"loss": 0.4782, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.27944111776447106, |
|
"grad_norm": 0.8917446136474609, |
|
"learning_rate": 3.5768800594637304e-05, |
|
"loss": 0.4401, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.281437125748503, |
|
"grad_norm": 0.882606029510498, |
|
"learning_rate": 3.582031854425634e-05, |
|
"loss": 0.4992, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.2834331337325349, |
|
"grad_norm": 0.870290219783783, |
|
"learning_rate": 3.587147240638428e-05, |
|
"loss": 0.5009, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.28542914171656686, |
|
"grad_norm": 0.8788816332817078, |
|
"learning_rate": 3.5922267291084366e-05, |
|
"loss": 0.3891, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.2874251497005988, |
|
"grad_norm": 0.8944652676582336, |
|
"learning_rate": 3.5972708201587496e-05, |
|
"loss": 0.442, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.2894211576846307, |
|
"grad_norm": 0.8970728516578674, |
|
"learning_rate": 3.6022800037249585e-05, |
|
"loss": 0.5065, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.29141716566866266, |
|
"grad_norm": 0.9061855673789978, |
|
"learning_rate": 3.607254759640729e-05, |
|
"loss": 0.4617, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.2934131736526946, |
|
"grad_norm": 0.851344883441925, |
|
"learning_rate": 3.612195557913627e-05, |
|
"loss": 0.4633, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.2954091816367265, |
|
"grad_norm": 0.8392930626869202, |
|
"learning_rate": 3.6171028589915954e-05, |
|
"loss": 0.434, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.29740518962075846, |
|
"grad_norm": 0.8495596051216125, |
|
"learning_rate": 3.6219771140204575e-05, |
|
"loss": 0.4627, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.2994011976047904, |
|
"grad_norm": 0.8151164650917053, |
|
"learning_rate": 3.626818765092802e-05, |
|
"loss": 0.4152, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3013972055888224, |
|
"grad_norm": 0.9488523602485657, |
|
"learning_rate": 3.6316282454886157e-05, |
|
"loss": 0.4912, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.3033932135728543, |
|
"grad_norm": 0.6952376365661621, |
|
"learning_rate": 3.636405979907955e-05, |
|
"loss": 0.3153, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.30538922155688625, |
|
"grad_norm": 0.8647618293762207, |
|
"learning_rate": 3.6411523846959985e-05, |
|
"loss": 0.4619, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.3073852295409182, |
|
"grad_norm": 0.8178197741508484, |
|
"learning_rate": 3.645867868060772e-05, |
|
"loss": 0.5165, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.3093812375249501, |
|
"grad_norm": 0.8717004060745239, |
|
"learning_rate": 3.6505528302838193e-05, |
|
"loss": 0.4222, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.31137724550898205, |
|
"grad_norm": 0.867859423160553, |
|
"learning_rate": 3.6552076639241027e-05, |
|
"loss": 0.4882, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.313373253493014, |
|
"grad_norm": 0.8131747841835022, |
|
"learning_rate": 3.65983275401539e-05, |
|
"loss": 0.4171, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.3153692614770459, |
|
"grad_norm": 0.8518748879432678, |
|
"learning_rate": 3.664428478257371e-05, |
|
"loss": 0.4342, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.31736526946107785, |
|
"grad_norm": 0.8354836702346802, |
|
"learning_rate": 3.668995207200753e-05, |
|
"loss": 0.4698, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.3193612774451098, |
|
"grad_norm": 0.9375539422035217, |
|
"learning_rate": 3.673533304426541e-05, |
|
"loss": 0.4896, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3213572854291417, |
|
"grad_norm": 0.8951889872550964, |
|
"learning_rate": 3.67804312671975e-05, |
|
"loss": 0.4997, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.32335329341317365, |
|
"grad_norm": 0.8014180064201355, |
|
"learning_rate": 3.682525024237719e-05, |
|
"loss": 0.47, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.3253493013972056, |
|
"grad_norm": 0.8288528323173523, |
|
"learning_rate": 3.6869793406732636e-05, |
|
"loss": 0.4085, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.3273453093812375, |
|
"grad_norm": 0.8221442699432373, |
|
"learning_rate": 3.69140641341283e-05, |
|
"loss": 0.4329, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.32934131736526945, |
|
"grad_norm": 0.7562230825424194, |
|
"learning_rate": 3.695806573689844e-05, |
|
"loss": 0.348, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.3313373253493014, |
|
"grad_norm": 0.9237514138221741, |
|
"learning_rate": 3.700180146733426e-05, |
|
"loss": 0.4342, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.9912509918212891, |
|
"learning_rate": 3.704527451912639e-05, |
|
"loss": 0.4809, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.33532934131736525, |
|
"grad_norm": 0.8607332110404968, |
|
"learning_rate": 3.708848802876438e-05, |
|
"loss": 0.4586, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.3373253493013972, |
|
"grad_norm": 0.8513320684432983, |
|
"learning_rate": 3.7131445076894564e-05, |
|
"loss": 0.4471, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.3393213572854291, |
|
"grad_norm": 0.8249276876449585, |
|
"learning_rate": 3.717414868963791e-05, |
|
"loss": 0.3867, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3413173652694611, |
|
"grad_norm": 0.9111822843551636, |
|
"learning_rate": 3.721660183986924e-05, |
|
"loss": 0.4502, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.34331337325349304, |
|
"grad_norm": 0.7368887066841125, |
|
"learning_rate": 3.725880744845915e-05, |
|
"loss": 0.3507, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.34530938123752497, |
|
"grad_norm": 0.8149043917655945, |
|
"learning_rate": 3.730076838547993e-05, |
|
"loss": 0.3512, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.3473053892215569, |
|
"grad_norm": 0.9387815594673157, |
|
"learning_rate": 3.734248747137666e-05, |
|
"loss": 0.4263, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.34930139720558884, |
|
"grad_norm": 0.860095202922821, |
|
"learning_rate": 3.738396747810492e-05, |
|
"loss": 0.3923, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.35129740518962077, |
|
"grad_norm": 0.7961985468864441, |
|
"learning_rate": 3.7425211130235834e-05, |
|
"loss": 0.3811, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.3532934131736527, |
|
"grad_norm": 0.8483626842498779, |
|
"learning_rate": 3.7466221106030115e-05, |
|
"loss": 0.4152, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.35528942115768464, |
|
"grad_norm": 0.7593052387237549, |
|
"learning_rate": 3.750700003848157e-05, |
|
"loss": 0.3321, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.35728542914171657, |
|
"grad_norm": 0.8519895672798157, |
|
"learning_rate": 3.7547550516331555e-05, |
|
"loss": 0.4008, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.3592814371257485, |
|
"grad_norm": 0.8514580726623535, |
|
"learning_rate": 3.75878750850551e-05, |
|
"loss": 0.408, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.36127744510978044, |
|
"grad_norm": 0.8409926891326904, |
|
"learning_rate": 3.7627976247819744e-05, |
|
"loss": 0.4076, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.36327345309381237, |
|
"grad_norm": 0.7313259840011597, |
|
"learning_rate": 3.766785646641792e-05, |
|
"loss": 0.4311, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.3652694610778443, |
|
"grad_norm": 0.7503537535667419, |
|
"learning_rate": 3.770751816217383e-05, |
|
"loss": 0.422, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.36726546906187624, |
|
"grad_norm": 0.7808623313903809, |
|
"learning_rate": 3.7746963716825615e-05, |
|
"loss": 0.4475, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.36926147704590817, |
|
"grad_norm": 0.6921509504318237, |
|
"learning_rate": 3.778619547338356e-05, |
|
"loss": 0.3981, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.3712574850299401, |
|
"grad_norm": 0.7929064035415649, |
|
"learning_rate": 3.782521573696528e-05, |
|
"loss": 0.4482, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.37325349301397204, |
|
"grad_norm": 0.7118304371833801, |
|
"learning_rate": 3.786402677560832e-05, |
|
"loss": 0.3413, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.37524950099800397, |
|
"grad_norm": 0.7609389424324036, |
|
"learning_rate": 3.790263082106134e-05, |
|
"loss": 0.4207, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.3772455089820359, |
|
"grad_norm": 0.8060720562934875, |
|
"learning_rate": 3.794103006955407e-05, |
|
"loss": 0.4521, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.37924151696606784, |
|
"grad_norm": 0.8100878596305847, |
|
"learning_rate": 3.797922668254715e-05, |
|
"loss": 0.3653, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3812375249500998, |
|
"grad_norm": 0.8395611047744751, |
|
"learning_rate": 3.801722278746213e-05, |
|
"loss": 0.3662, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.38323353293413176, |
|
"grad_norm": 0.7541958093643188, |
|
"learning_rate": 3.8055020478392495e-05, |
|
"loss": 0.2939, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.3852295409181637, |
|
"grad_norm": 0.8053567409515381, |
|
"learning_rate": 3.809262181679623e-05, |
|
"loss": 0.4302, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.3872255489021956, |
|
"grad_norm": 0.8586562275886536, |
|
"learning_rate": 3.813002883217044e-05, |
|
"loss": 0.3984, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.38922155688622756, |
|
"grad_norm": 0.7566971778869629, |
|
"learning_rate": 3.816724352270863e-05, |
|
"loss": 0.3839, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.3912175648702595, |
|
"grad_norm": 0.8142690658569336, |
|
"learning_rate": 3.8204267855941266e-05, |
|
"loss": 0.4014, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.3932135728542914, |
|
"grad_norm": 0.7769673466682434, |
|
"learning_rate": 3.824110376935989e-05, |
|
"loss": 0.3791, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.39520958083832336, |
|
"grad_norm": 0.8861010670661926, |
|
"learning_rate": 3.827775317102552e-05, |
|
"loss": 0.3904, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.3972055888223553, |
|
"grad_norm": 0.7677756547927856, |
|
"learning_rate": 3.831421794016178e-05, |
|
"loss": 0.3933, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.3992015968063872, |
|
"grad_norm": 0.9913503527641296, |
|
"learning_rate": 3.835049992773302e-05, |
|
"loss": 0.439, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.40119760479041916, |
|
"grad_norm": 0.847613513469696, |
|
"learning_rate": 3.838660095700815e-05, |
|
"loss": 0.4462, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.4031936127744511, |
|
"grad_norm": 0.8353721499443054, |
|
"learning_rate": 3.84225228241104e-05, |
|
"loss": 0.4621, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.405189620758483, |
|
"grad_norm": 0.809059739112854, |
|
"learning_rate": 3.8458267298553554e-05, |
|
"loss": 0.4888, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.40718562874251496, |
|
"grad_norm": 0.7496485710144043, |
|
"learning_rate": 3.8493836123764984e-05, |
|
"loss": 0.3836, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.4091816367265469, |
|
"grad_norm": 0.9037646651268005, |
|
"learning_rate": 3.852923101759591e-05, |
|
"loss": 0.3993, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.4111776447105788, |
|
"grad_norm": 0.8741063475608826, |
|
"learning_rate": 3.856445367281923e-05, |
|
"loss": 0.3948, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.41317365269461076, |
|
"grad_norm": 0.8445413112640381, |
|
"learning_rate": 3.859950575761529e-05, |
|
"loss": 0.4305, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.4151696606786427, |
|
"grad_norm": 0.9107454419136047, |
|
"learning_rate": 3.8634388916046025e-05, |
|
"loss": 0.4982, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.4171656686626746, |
|
"grad_norm": 0.7765053510665894, |
|
"learning_rate": 3.866910476851757e-05, |
|
"loss": 0.4147, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.41916167664670656, |
|
"grad_norm": 0.8410398364067078, |
|
"learning_rate": 3.870365491223199e-05, |
|
"loss": 0.4125, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.42115768463073855, |
|
"grad_norm": 0.8012726306915283, |
|
"learning_rate": 3.8738040921628215e-05, |
|
"loss": 0.3941, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.4231536926147705, |
|
"grad_norm": 0.8541998863220215, |
|
"learning_rate": 3.877226434881253e-05, |
|
"loss": 0.4327, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.4251497005988024, |
|
"grad_norm": 0.8243539929389954, |
|
"learning_rate": 3.880632672397897e-05, |
|
"loss": 0.4303, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.42714570858283435, |
|
"grad_norm": 0.8121338486671448, |
|
"learning_rate": 3.884022955581985e-05, |
|
"loss": 0.4301, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.4291417165668663, |
|
"grad_norm": 0.9100980758666992, |
|
"learning_rate": 3.887397433192676e-05, |
|
"loss": 0.4208, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.4311377245508982, |
|
"grad_norm": 0.748666524887085, |
|
"learning_rate": 3.890756251918219e-05, |
|
"loss": 0.3384, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.43313373253493015, |
|
"grad_norm": 0.758114755153656, |
|
"learning_rate": 3.894099556414216e-05, |
|
"loss": 0.3797, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.4351297405189621, |
|
"grad_norm": 0.8046779632568359, |
|
"learning_rate": 3.897427489341009e-05, |
|
"loss": 0.4325, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.437125748502994, |
|
"grad_norm": 0.872130274772644, |
|
"learning_rate": 3.900740191400198e-05, |
|
"loss": 0.4466, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.43912175648702595, |
|
"grad_norm": 0.8052610158920288, |
|
"learning_rate": 3.904037801370344e-05, |
|
"loss": 0.4355, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4411177644710579, |
|
"grad_norm": 0.7204791903495789, |
|
"learning_rate": 3.9073204561418514e-05, |
|
"loss": 0.3465, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.4431137724550898, |
|
"grad_norm": 0.7979363799095154, |
|
"learning_rate": 3.9105882907510644e-05, |
|
"loss": 0.4004, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.44510978043912175, |
|
"grad_norm": 0.7269802093505859, |
|
"learning_rate": 3.913841438413601e-05, |
|
"loss": 0.4261, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.4471057884231537, |
|
"grad_norm": 0.6730761528015137, |
|
"learning_rate": 3.917080030556938e-05, |
|
"loss": 0.3192, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.4491017964071856, |
|
"grad_norm": 0.8741471767425537, |
|
"learning_rate": 3.9203041968522716e-05, |
|
"loss": 0.4663, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.45109780439121755, |
|
"grad_norm": 0.8293672800064087, |
|
"learning_rate": 3.923514065245669e-05, |
|
"loss": 0.4558, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.4530938123752495, |
|
"grad_norm": 0.7904106378555298, |
|
"learning_rate": 3.926709761988538e-05, |
|
"loss": 0.4546, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.4550898203592814, |
|
"grad_norm": 0.7640888094902039, |
|
"learning_rate": 3.929891411667424e-05, |
|
"loss": 0.3762, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.45708582834331335, |
|
"grad_norm": 0.776006281375885, |
|
"learning_rate": 3.933059137233147e-05, |
|
"loss": 0.4447, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.4590818363273453, |
|
"grad_norm": 0.8613069653511047, |
|
"learning_rate": 3.9362130600293214e-05, |
|
"loss": 0.4366, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.46107784431137727, |
|
"grad_norm": 0.7828835248947144, |
|
"learning_rate": 3.9393532998202405e-05, |
|
"loss": 0.4434, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.4630738522954092, |
|
"grad_norm": 0.7422530055046082, |
|
"learning_rate": 3.942479974818166e-05, |
|
"loss": 0.3755, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.46506986027944114, |
|
"grad_norm": 0.7256511449813843, |
|
"learning_rate": 3.945593201710032e-05, |
|
"loss": 0.375, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.46706586826347307, |
|
"grad_norm": 0.7594771385192871, |
|
"learning_rate": 3.9486930956835724e-05, |
|
"loss": 0.3985, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.469061876247505, |
|
"grad_norm": 0.7957077622413635, |
|
"learning_rate": 3.951779770452894e-05, |
|
"loss": 0.421, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.47105788423153694, |
|
"grad_norm": 0.7573441863059998, |
|
"learning_rate": 3.954853338283512e-05, |
|
"loss": 0.4592, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.47305389221556887, |
|
"grad_norm": 0.7109091877937317, |
|
"learning_rate": 3.9579139100168404e-05, |
|
"loss": 0.3732, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.4750499001996008, |
|
"grad_norm": 0.8672693371772766, |
|
"learning_rate": 3.960961595094187e-05, |
|
"loss": 0.4038, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.47704590818363274, |
|
"grad_norm": 0.7573640942573547, |
|
"learning_rate": 3.96399650158023e-05, |
|
"loss": 0.4348, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.47904191616766467, |
|
"grad_norm": 0.8784688711166382, |
|
"learning_rate": 3.96701873618601e-05, |
|
"loss": 0.4704, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4810379241516966, |
|
"grad_norm": 0.8110889792442322, |
|
"learning_rate": 3.970028404291448e-05, |
|
"loss": 0.381, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.48303393213572854, |
|
"grad_norm": 0.8944825530052185, |
|
"learning_rate": 3.9730256099673865e-05, |
|
"loss": 0.3282, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.48502994011976047, |
|
"grad_norm": 0.8505921959877014, |
|
"learning_rate": 3.976010455997187e-05, |
|
"loss": 0.3794, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.4870259481037924, |
|
"grad_norm": 1.0878411531448364, |
|
"learning_rate": 3.978983043897883e-05, |
|
"loss": 0.4222, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.48902195608782434, |
|
"grad_norm": 0.7262081503868103, |
|
"learning_rate": 3.981943473940888e-05, |
|
"loss": 0.3682, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.49101796407185627, |
|
"grad_norm": 1.0304243564605713, |
|
"learning_rate": 3.984891845172299e-05, |
|
"loss": 0.3546, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.4930139720558882, |
|
"grad_norm": 0.7483956217765808, |
|
"learning_rate": 3.987828255432777e-05, |
|
"loss": 0.3764, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.49500998003992014, |
|
"grad_norm": 1.969207525253296, |
|
"learning_rate": 3.9907528013770276e-05, |
|
"loss": 0.4436, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.49700598802395207, |
|
"grad_norm": 0.836520254611969, |
|
"learning_rate": 3.993665578492894e-05, |
|
"loss": 0.4477, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.499001996007984, |
|
"grad_norm": 0.8878058791160583, |
|
"learning_rate": 3.9965666811200624e-05, |
|
"loss": 0.355, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.500998003992016, |
|
"grad_norm": 0.7905710935592651, |
|
"learning_rate": 3.999456202468397e-05, |
|
"loss": 0.4044, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.5029940119760479, |
|
"grad_norm": 0.7035382390022278, |
|
"learning_rate": 4.002334234635907e-05, |
|
"loss": 0.3515, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.5049900199600799, |
|
"grad_norm": 1.702528476715088, |
|
"learning_rate": 4.005200868626364e-05, |
|
"loss": 0.4055, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.5069860279441117, |
|
"grad_norm": 0.7991278171539307, |
|
"learning_rate": 4.008056194366564e-05, |
|
"loss": 0.4327, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.5089820359281437, |
|
"grad_norm": 0.803960382938385, |
|
"learning_rate": 4.010900300723259e-05, |
|
"loss": 0.4187, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.5109780439121756, |
|
"grad_norm": 0.7045860886573792, |
|
"learning_rate": 4.013733275519749e-05, |
|
"loss": 0.3947, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.5129740518962076, |
|
"grad_norm": 0.7627609372138977, |
|
"learning_rate": 4.016555205552158e-05, |
|
"loss": 0.3808, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.5149700598802395, |
|
"grad_norm": 0.7807031869888306, |
|
"learning_rate": 4.0193661766053834e-05, |
|
"loss": 0.4408, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.5169660678642715, |
|
"grad_norm": 0.7607232332229614, |
|
"learning_rate": 4.022166273468753e-05, |
|
"loss": 0.3826, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.5189620758483033, |
|
"grad_norm": 0.738200306892395, |
|
"learning_rate": 4.024955579951363e-05, |
|
"loss": 0.3403, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5209580838323353, |
|
"grad_norm": 0.7401778101921082, |
|
"learning_rate": 4.027734178897136e-05, |
|
"loss": 0.3927, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.5229540918163673, |
|
"grad_norm": 0.8561487793922424, |
|
"learning_rate": 4.030502152199576e-05, |
|
"loss": 0.4247, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.5249500998003992, |
|
"grad_norm": 0.7845680117607117, |
|
"learning_rate": 4.033259580816264e-05, |
|
"loss": 0.4284, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.5269461077844312, |
|
"grad_norm": 0.8121227622032166, |
|
"learning_rate": 4.036006544783052e-05, |
|
"loss": 0.4534, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.5289421157684631, |
|
"grad_norm": 0.7015953660011292, |
|
"learning_rate": 4.0387431232280135e-05, |
|
"loss": 0.3404, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.530938123752495, |
|
"grad_norm": 0.7971146702766418, |
|
"learning_rate": 4.041469394385112e-05, |
|
"loss": 0.4455, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.5329341317365269, |
|
"grad_norm": 0.7655112147331238, |
|
"learning_rate": 4.0441854356076257e-05, |
|
"loss": 0.4636, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.5349301397205589, |
|
"grad_norm": 0.8320984840393066, |
|
"learning_rate": 4.046891323381315e-05, |
|
"loss": 0.3777, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.5369261477045908, |
|
"grad_norm": 0.9041264057159424, |
|
"learning_rate": 4.049587133337347e-05, |
|
"loss": 0.4006, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.5389221556886228, |
|
"grad_norm": 0.8236355185508728, |
|
"learning_rate": 4.0522729402649793e-05, |
|
"loss": 0.418, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5409181636726547, |
|
"grad_norm": 0.9298795461654663, |
|
"learning_rate": 4.0549488181240096e-05, |
|
"loss": 0.3358, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.5429141716566867, |
|
"grad_norm": 0.7561654448509216, |
|
"learning_rate": 4.057614840056998e-05, |
|
"loss": 0.4008, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.5449101796407185, |
|
"grad_norm": 0.7712647318840027, |
|
"learning_rate": 4.06027107840126e-05, |
|
"loss": 0.3607, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.5469061876247505, |
|
"grad_norm": 0.7622309327125549, |
|
"learning_rate": 4.0629176047006474e-05, |
|
"loss": 0.3567, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.5489021956087824, |
|
"grad_norm": 0.7064681649208069, |
|
"learning_rate": 4.065554489717105e-05, |
|
"loss": 0.3528, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.5508982035928144, |
|
"grad_norm": 0.8189475536346436, |
|
"learning_rate": 4.068181803442029e-05, |
|
"loss": 0.4062, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.5528942115768463, |
|
"grad_norm": 0.8143854737281799, |
|
"learning_rate": 4.0707996151074147e-05, |
|
"loss": 0.4374, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.5548902195608783, |
|
"grad_norm": 0.7282266616821289, |
|
"learning_rate": 4.073407993196794e-05, |
|
"loss": 0.4121, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.5568862275449101, |
|
"grad_norm": 0.7541894316673279, |
|
"learning_rate": 4.076007005455996e-05, |
|
"loss": 0.4702, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.5588822355289421, |
|
"grad_norm": 0.7178213596343994, |
|
"learning_rate": 4.0785967189036986e-05, |
|
"loss": 0.3581, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5608782435129741, |
|
"grad_norm": 0.8269951343536377, |
|
"learning_rate": 4.0811771998418e-05, |
|
"loss": 0.414, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.562874251497006, |
|
"grad_norm": 0.6949253082275391, |
|
"learning_rate": 4.083748513865602e-05, |
|
"loss": 0.3549, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.564870259481038, |
|
"grad_norm": 0.8457996845245361, |
|
"learning_rate": 4.086310725873818e-05, |
|
"loss": 0.4977, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.5668662674650699, |
|
"grad_norm": 0.835884690284729, |
|
"learning_rate": 4.0888639000783966e-05, |
|
"loss": 0.4646, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5688622754491018, |
|
"grad_norm": 0.745847225189209, |
|
"learning_rate": 4.0914081000141844e-05, |
|
"loss": 0.4295, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.5708582834331337, |
|
"grad_norm": 0.703731119632721, |
|
"learning_rate": 4.0939433885484055e-05, |
|
"loss": 0.3168, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.5728542914171657, |
|
"grad_norm": 0.6979167461395264, |
|
"learning_rate": 4.0964698278899874e-05, |
|
"loss": 0.3373, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.5748502994011976, |
|
"grad_norm": 0.7321177124977112, |
|
"learning_rate": 4.0989874795987185e-05, |
|
"loss": 0.3705, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.5768463073852296, |
|
"grad_norm": 0.6812002658843994, |
|
"learning_rate": 4.1014964045942465e-05, |
|
"loss": 0.366, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.5788423153692615, |
|
"grad_norm": 0.8122517466545105, |
|
"learning_rate": 4.103996663164927e-05, |
|
"loss": 0.4435, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5808383233532934, |
|
"grad_norm": 0.7670555710792542, |
|
"learning_rate": 4.106488314976513e-05, |
|
"loss": 0.471, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.5828343313373253, |
|
"grad_norm": 0.7457311749458313, |
|
"learning_rate": 4.108971419080698e-05, |
|
"loss": 0.3138, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.5848303393213573, |
|
"grad_norm": 0.8164945244789124, |
|
"learning_rate": 4.111446033923516e-05, |
|
"loss": 0.4394, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.5868263473053892, |
|
"grad_norm": 0.7513836622238159, |
|
"learning_rate": 4.113912217353596e-05, |
|
"loss": 0.3741, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.5888223552894212, |
|
"grad_norm": 0.7199726700782776, |
|
"learning_rate": 4.116370026630272e-05, |
|
"loss": 0.3116, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.590818363273453, |
|
"grad_norm": 0.8232783675193787, |
|
"learning_rate": 4.118819518431564e-05, |
|
"loss": 0.4048, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.592814371257485, |
|
"grad_norm": 0.7513990998268127, |
|
"learning_rate": 4.121260748862021e-05, |
|
"loss": 0.4346, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.5948103792415169, |
|
"grad_norm": 0.6866230368614197, |
|
"learning_rate": 4.123693773460426e-05, |
|
"loss": 0.3629, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.5968063872255489, |
|
"grad_norm": 0.7753307223320007, |
|
"learning_rate": 4.126118647207383e-05, |
|
"loss": 0.4248, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.5988023952095808, |
|
"grad_norm": 0.6598490476608276, |
|
"learning_rate": 4.1285354245327715e-05, |
|
"loss": 0.2834, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6007984031936128, |
|
"grad_norm": 0.8024352788925171, |
|
"learning_rate": 4.1309441593230726e-05, |
|
"loss": 0.4276, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.6027944111776448, |
|
"grad_norm": 0.7745522260665894, |
|
"learning_rate": 4.133344904928585e-05, |
|
"loss": 0.3925, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.6047904191616766, |
|
"grad_norm": 0.6376944184303284, |
|
"learning_rate": 4.1357377141705084e-05, |
|
"loss": 0.2589, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.6067864271457086, |
|
"grad_norm": 0.6831088662147522, |
|
"learning_rate": 4.1381226393479236e-05, |
|
"loss": 0.3705, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.6087824351297405, |
|
"grad_norm": 0.6832078695297241, |
|
"learning_rate": 4.1404997322446435e-05, |
|
"loss": 0.3637, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.6107784431137725, |
|
"grad_norm": 0.7155686020851135, |
|
"learning_rate": 4.142869044135967e-05, |
|
"loss": 0.477, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.6127744510978044, |
|
"grad_norm": 0.7326770424842834, |
|
"learning_rate": 4.145230625795311e-05, |
|
"loss": 0.4123, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.6147704590818364, |
|
"grad_norm": 0.7184780240058899, |
|
"learning_rate": 4.14758452750074e-05, |
|
"loss": 0.3382, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.6167664670658682, |
|
"grad_norm": 0.7494658827781677, |
|
"learning_rate": 4.149930799041392e-05, |
|
"loss": 0.4246, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.6187624750499002, |
|
"grad_norm": 0.697238028049469, |
|
"learning_rate": 4.152269489723788e-05, |
|
"loss": 0.4338, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6207584830339321, |
|
"grad_norm": 0.6342530846595764, |
|
"learning_rate": 4.1546006483780626e-05, |
|
"loss": 0.3202, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.6227544910179641, |
|
"grad_norm": 0.7153366804122925, |
|
"learning_rate": 4.156924323364072e-05, |
|
"loss": 0.3778, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.624750499001996, |
|
"grad_norm": 0.6666108965873718, |
|
"learning_rate": 4.1592405625774144e-05, |
|
"loss": 0.346, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.626746506986028, |
|
"grad_norm": 0.7076640725135803, |
|
"learning_rate": 4.161549413455358e-05, |
|
"loss": 0.3827, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.6287425149700598, |
|
"grad_norm": 0.7535362839698792, |
|
"learning_rate": 4.163850922982668e-05, |
|
"loss": 0.401, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.6307385229540918, |
|
"grad_norm": 0.6954286098480225, |
|
"learning_rate": 4.16614513769734e-05, |
|
"loss": 0.376, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.6327345309381237, |
|
"grad_norm": 0.6925478577613831, |
|
"learning_rate": 4.1684321036962526e-05, |
|
"loss": 0.3638, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.6347305389221557, |
|
"grad_norm": 0.663144588470459, |
|
"learning_rate": 4.170711866640721e-05, |
|
"loss": 0.3558, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.6367265469061876, |
|
"grad_norm": 0.7284447550773621, |
|
"learning_rate": 4.1729844717619684e-05, |
|
"loss": 0.4159, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.6387225548902196, |
|
"grad_norm": 0.708574652671814, |
|
"learning_rate": 4.17524996386651e-05, |
|
"loss": 0.3942, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6407185628742516, |
|
"grad_norm": 0.6826594471931458, |
|
"learning_rate": 4.177508387341454e-05, |
|
"loss": 0.3563, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.6427145708582834, |
|
"grad_norm": 0.7092903256416321, |
|
"learning_rate": 4.179759786159719e-05, |
|
"loss": 0.4169, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.6447105788423154, |
|
"grad_norm": 0.6470283269882202, |
|
"learning_rate": 4.182004203885172e-05, |
|
"loss": 0.3595, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.6467065868263473, |
|
"grad_norm": 0.6560471057891846, |
|
"learning_rate": 4.184241683677687e-05, |
|
"loss": 0.3945, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.6487025948103793, |
|
"grad_norm": 0.7021344900131226, |
|
"learning_rate": 4.1864722682981245e-05, |
|
"loss": 0.3682, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.6506986027944112, |
|
"grad_norm": 0.6736760139465332, |
|
"learning_rate": 4.188696000113232e-05, |
|
"loss": 0.4012, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.6526946107784432, |
|
"grad_norm": 0.58335942029953, |
|
"learning_rate": 4.190912921100477e-05, |
|
"loss": 0.2982, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.654690618762475, |
|
"grad_norm": 0.7224960327148438, |
|
"learning_rate": 4.1931230728527994e-05, |
|
"loss": 0.3767, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.656686626746507, |
|
"grad_norm": 0.7125536203384399, |
|
"learning_rate": 4.195326496583291e-05, |
|
"loss": 0.3918, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.6586826347305389, |
|
"grad_norm": 0.7161789536476135, |
|
"learning_rate": 4.1975232331298125e-05, |
|
"loss": 0.3727, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6606786427145709, |
|
"grad_norm": 0.7045012712478638, |
|
"learning_rate": 4.1997133229595316e-05, |
|
"loss": 0.4168, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.6626746506986028, |
|
"grad_norm": 0.7229664921760559, |
|
"learning_rate": 4.201896806173394e-05, |
|
"loss": 0.406, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.6646706586826348, |
|
"grad_norm": 0.6685640811920166, |
|
"learning_rate": 4.2040737225105335e-05, |
|
"loss": 0.3348, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.6416003108024597, |
|
"learning_rate": 4.206244111352608e-05, |
|
"loss": 0.3134, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.6686626746506986, |
|
"grad_norm": 0.6860243082046509, |
|
"learning_rate": 4.2084080117280756e-05, |
|
"loss": 0.3855, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.6706586826347305, |
|
"grad_norm": 0.751287579536438, |
|
"learning_rate": 4.210565462316407e-05, |
|
"loss": 0.4388, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.6726546906187625, |
|
"grad_norm": 0.7298620939254761, |
|
"learning_rate": 4.2127165014522315e-05, |
|
"loss": 0.4084, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.6746506986027944, |
|
"grad_norm": 0.7535167336463928, |
|
"learning_rate": 4.214861167129425e-05, |
|
"loss": 0.3971, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.6766467065868264, |
|
"grad_norm": 0.6288606524467468, |
|
"learning_rate": 4.2169994970051365e-05, |
|
"loss": 0.3184, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.6786427145708582, |
|
"grad_norm": 0.6942071914672852, |
|
"learning_rate": 4.219131528403759e-05, |
|
"loss": 0.4085, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6806387225548902, |
|
"grad_norm": 0.7049132585525513, |
|
"learning_rate": 4.22125729832083e-05, |
|
"loss": 0.3799, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.6826347305389222, |
|
"grad_norm": 0.6633714437484741, |
|
"learning_rate": 4.2233768434268914e-05, |
|
"loss": 0.3615, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.6846307385229541, |
|
"grad_norm": 0.7143837809562683, |
|
"learning_rate": 4.225490200071284e-05, |
|
"loss": 0.397, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.6866267465069861, |
|
"grad_norm": 0.6334770917892456, |
|
"learning_rate": 4.227597404285883e-05, |
|
"loss": 0.3192, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.688622754491018, |
|
"grad_norm": 0.6318526268005371, |
|
"learning_rate": 4.229698491788791e-05, |
|
"loss": 0.3409, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.6906187624750499, |
|
"grad_norm": 0.6425897479057312, |
|
"learning_rate": 4.231793497987961e-05, |
|
"loss": 0.3506, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.6926147704590818, |
|
"grad_norm": 0.6882063150405884, |
|
"learning_rate": 4.2338824579847904e-05, |
|
"loss": 0.3697, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.6946107784431138, |
|
"grad_norm": 0.6814457774162292, |
|
"learning_rate": 4.235965406577636e-05, |
|
"loss": 0.4179, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.6966067864271457, |
|
"grad_norm": 0.7089083790779114, |
|
"learning_rate": 4.2380423782653e-05, |
|
"loss": 0.358, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.6986027944111777, |
|
"grad_norm": 0.671987771987915, |
|
"learning_rate": 4.240113407250459e-05, |
|
"loss": 0.4223, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7005988023952096, |
|
"grad_norm": 0.6932473182678223, |
|
"learning_rate": 4.24217852744304e-05, |
|
"loss": 0.4283, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.7025948103792415, |
|
"grad_norm": 0.6401710510253906, |
|
"learning_rate": 4.244237772463552e-05, |
|
"loss": 0.3277, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.7045908183632734, |
|
"grad_norm": 0.5808695554733276, |
|
"learning_rate": 4.246291175646371e-05, |
|
"loss": 0.3153, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.7065868263473054, |
|
"grad_norm": 0.5929372310638428, |
|
"learning_rate": 4.24833877004298e-05, |
|
"loss": 0.2934, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.7085828343313373, |
|
"grad_norm": 0.6138365864753723, |
|
"learning_rate": 4.250380588425157e-05, |
|
"loss": 0.2647, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.7105788423153693, |
|
"grad_norm": 0.69126957654953, |
|
"learning_rate": 4.2524166632881255e-05, |
|
"loss": 0.3777, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.7125748502994012, |
|
"grad_norm": 0.618993878364563, |
|
"learning_rate": 4.254447026853656e-05, |
|
"loss": 0.2874, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.7145708582834331, |
|
"grad_norm": 0.6197064518928528, |
|
"learning_rate": 4.2564717110731244e-05, |
|
"loss": 0.3137, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.716566866267465, |
|
"grad_norm": 0.6574029326438904, |
|
"learning_rate": 4.258490747630532e-05, |
|
"loss": 0.3366, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.718562874251497, |
|
"grad_norm": 0.6827244162559509, |
|
"learning_rate": 4.260504167945479e-05, |
|
"loss": 0.367, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.720558882235529, |
|
"grad_norm": 0.6920093297958374, |
|
"learning_rate": 4.2625120031760965e-05, |
|
"loss": 0.3473, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.7225548902195609, |
|
"grad_norm": 0.6315056085586548, |
|
"learning_rate": 4.264514284221944e-05, |
|
"loss": 0.3477, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.7245508982035929, |
|
"grad_norm": 0.6894274950027466, |
|
"learning_rate": 4.266511041726854e-05, |
|
"loss": 0.3818, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.7265469061876247, |
|
"grad_norm": 0.7182605266571045, |
|
"learning_rate": 4.26850230608176e-05, |
|
"loss": 0.3959, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.7285429141716567, |
|
"grad_norm": 0.6431974172592163, |
|
"learning_rate": 4.2704881074274584e-05, |
|
"loss": 0.3484, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.7305389221556886, |
|
"grad_norm": 0.6523058414459229, |
|
"learning_rate": 4.272468475657351e-05, |
|
"loss": 0.3315, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.7325349301397206, |
|
"grad_norm": 0.7160993218421936, |
|
"learning_rate": 4.2744434404201497e-05, |
|
"loss": 0.3806, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.7345309381237525, |
|
"grad_norm": 0.6819020509719849, |
|
"learning_rate": 4.27641303112253e-05, |
|
"loss": 0.3889, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.7365269461077845, |
|
"grad_norm": 0.5881057381629944, |
|
"learning_rate": 4.278377276931767e-05, |
|
"loss": 0.2647, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.7385229540918163, |
|
"grad_norm": 1.0767422914505005, |
|
"learning_rate": 4.2803362067783256e-05, |
|
"loss": 0.3912, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7405189620758483, |
|
"grad_norm": 0.6878696084022522, |
|
"learning_rate": 4.2822898493584104e-05, |
|
"loss": 0.4216, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.7425149700598802, |
|
"grad_norm": 0.6871569752693176, |
|
"learning_rate": 4.284238233136496e-05, |
|
"loss": 0.395, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.7445109780439122, |
|
"grad_norm": 0.6874458193778992, |
|
"learning_rate": 4.286181386347813e-05, |
|
"loss": 0.3683, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.7465069860279441, |
|
"grad_norm": 0.6394293308258057, |
|
"learning_rate": 4.288119337000801e-05, |
|
"loss": 0.3518, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.7485029940119761, |
|
"grad_norm": 0.67393559217453, |
|
"learning_rate": 4.2900521128795315e-05, |
|
"loss": 0.4018, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.7504990019960079, |
|
"grad_norm": 0.6365067958831787, |
|
"learning_rate": 4.291979741546102e-05, |
|
"loss": 0.3719, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.7524950099800399, |
|
"grad_norm": 0.6792694926261902, |
|
"learning_rate": 4.293902250342989e-05, |
|
"loss": 0.3623, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.7544910179640718, |
|
"grad_norm": 0.794163167476654, |
|
"learning_rate": 4.295819666395376e-05, |
|
"loss": 0.3945, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.7564870259481038, |
|
"grad_norm": 0.7103076577186584, |
|
"learning_rate": 4.297732016613454e-05, |
|
"loss": 0.4585, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.7584830339321357, |
|
"grad_norm": 0.6877479553222656, |
|
"learning_rate": 4.299639327694684e-05, |
|
"loss": 0.4261, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7604790419161677, |
|
"grad_norm": 0.6512800455093384, |
|
"learning_rate": 4.3015416261260325e-05, |
|
"loss": 0.336, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.7624750499001997, |
|
"grad_norm": 0.6555919051170349, |
|
"learning_rate": 4.303438938186182e-05, |
|
"loss": 0.3949, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.7644710578842315, |
|
"grad_norm": 0.6375437378883362, |
|
"learning_rate": 4.305331289947705e-05, |
|
"loss": 0.348, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.7664670658682635, |
|
"grad_norm": 0.6899069547653198, |
|
"learning_rate": 4.3072187072792184e-05, |
|
"loss": 0.3715, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.7684630738522954, |
|
"grad_norm": 0.6571375727653503, |
|
"learning_rate": 4.309101215847502e-05, |
|
"loss": 0.3471, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.7704590818363274, |
|
"grad_norm": 0.6866909265518188, |
|
"learning_rate": 4.3109788411195924e-05, |
|
"loss": 0.3721, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.7724550898203593, |
|
"grad_norm": 0.6416053175926208, |
|
"learning_rate": 4.312851608364853e-05, |
|
"loss": 0.3501, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.7744510978043913, |
|
"grad_norm": 0.6585414409637451, |
|
"learning_rate": 4.314719542657013e-05, |
|
"loss": 0.3446, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.7764471057884231, |
|
"grad_norm": 0.6449529528617859, |
|
"learning_rate": 4.3165826688761796e-05, |
|
"loss": 0.31, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.7784431137724551, |
|
"grad_norm": 0.6616773009300232, |
|
"learning_rate": 4.318441011710833e-05, |
|
"loss": 0.3356, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.780439121756487, |
|
"grad_norm": 0.681754469871521, |
|
"learning_rate": 4.3202945956597786e-05, |
|
"loss": 0.3543, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.782435129740519, |
|
"grad_norm": 0.6211993098258972, |
|
"learning_rate": 4.3221434450340956e-05, |
|
"loss": 0.3157, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.7844311377245509, |
|
"grad_norm": 0.6262781620025635, |
|
"learning_rate": 4.323987583959045e-05, |
|
"loss": 0.3533, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.7864271457085829, |
|
"grad_norm": 0.6640245318412781, |
|
"learning_rate": 4.325827036375957e-05, |
|
"loss": 0.3742, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.7884231536926147, |
|
"grad_norm": 0.6164320111274719, |
|
"learning_rate": 4.327661826044101e-05, |
|
"loss": 0.3472, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7904191616766467, |
|
"grad_norm": 0.6439725756645203, |
|
"learning_rate": 4.329491976542521e-05, |
|
"loss": 0.359, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.7924151696606786, |
|
"grad_norm": 0.7187615036964417, |
|
"learning_rate": 4.331317511271859e-05, |
|
"loss": 0.4445, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.7944111776447106, |
|
"grad_norm": 0.660010039806366, |
|
"learning_rate": 4.333138453456147e-05, |
|
"loss": 0.3213, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.7964071856287425, |
|
"grad_norm": 0.7590385675430298, |
|
"learning_rate": 4.334954826144581e-05, |
|
"loss": 0.3359, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.7984031936127745, |
|
"grad_norm": 0.6344367861747742, |
|
"learning_rate": 4.336766652213271e-05, |
|
"loss": 0.3542, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8003992015968064, |
|
"grad_norm": 0.6679601073265076, |
|
"learning_rate": 4.338573954366971e-05, |
|
"loss": 0.3642, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.8023952095808383, |
|
"grad_norm": 0.6402161121368408, |
|
"learning_rate": 4.340376755140784e-05, |
|
"loss": 0.3603, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.8043912175648703, |
|
"grad_norm": 0.7084898948669434, |
|
"learning_rate": 4.342175076901849e-05, |
|
"loss": 0.3817, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.8063872255489022, |
|
"grad_norm": 0.6191865801811218, |
|
"learning_rate": 4.343968941851009e-05, |
|
"loss": 0.3017, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.8083832335329342, |
|
"grad_norm": 0.6750943660736084, |
|
"learning_rate": 4.345758372024448e-05, |
|
"loss": 0.3949, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.810379241516966, |
|
"grad_norm": 0.6468753814697266, |
|
"learning_rate": 4.347543389295324e-05, |
|
"loss": 0.3668, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.812375249500998, |
|
"grad_norm": 0.6904520988464355, |
|
"learning_rate": 4.3493240153753666e-05, |
|
"loss": 0.3499, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.8143712574850299, |
|
"grad_norm": 0.6204891800880432, |
|
"learning_rate": 4.3511002718164666e-05, |
|
"loss": 0.3304, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.8163672654690619, |
|
"grad_norm": 0.6633168458938599, |
|
"learning_rate": 4.352872180012237e-05, |
|
"loss": 0.3337, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.8183632734530938, |
|
"grad_norm": 0.8691318035125732, |
|
"learning_rate": 4.35463976119956e-05, |
|
"loss": 0.4502, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.8203592814371258, |
|
"grad_norm": 0.7373143434524536, |
|
"learning_rate": 4.356403036460115e-05, |
|
"loss": 0.4128, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.8223552894211577, |
|
"grad_norm": 0.6885534524917603, |
|
"learning_rate": 4.3581620267218916e-05, |
|
"loss": 0.3341, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.8243512974051896, |
|
"grad_norm": 0.6862485408782959, |
|
"learning_rate": 4.359916752760669e-05, |
|
"loss": 0.3498, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.8263473053892215, |
|
"grad_norm": 0.6959711313247681, |
|
"learning_rate": 4.361667235201499e-05, |
|
"loss": 0.3796, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.8283433133732535, |
|
"grad_norm": 0.7265036106109619, |
|
"learning_rate": 4.363413494520154e-05, |
|
"loss": 0.3911, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.8303393213572854, |
|
"grad_norm": 0.6805566549301147, |
|
"learning_rate": 4.365155551044572e-05, |
|
"loss": 0.367, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.8323353293413174, |
|
"grad_norm": 0.6219791173934937, |
|
"learning_rate": 4.366893424956263e-05, |
|
"loss": 0.289, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.8343313373253493, |
|
"grad_norm": 0.6582449674606323, |
|
"learning_rate": 4.368627136291726e-05, |
|
"loss": 0.2747, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.8363273453093812, |
|
"grad_norm": 0.6985988616943359, |
|
"learning_rate": 4.370356704943825e-05, |
|
"loss": 0.3435, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.8383233532934131, |
|
"grad_norm": 0.6607214212417603, |
|
"learning_rate": 4.372082150663168e-05, |
|
"loss": 0.3645, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8403193612774451, |
|
"grad_norm": 0.723174512386322, |
|
"learning_rate": 4.3738034930594475e-05, |
|
"loss": 0.3672, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.8423153692614771, |
|
"grad_norm": 0.6832453012466431, |
|
"learning_rate": 4.3755207516027904e-05, |
|
"loss": 0.3806, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.844311377245509, |
|
"grad_norm": 0.6922501921653748, |
|
"learning_rate": 4.377233945625071e-05, |
|
"loss": 0.4031, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.846307385229541, |
|
"grad_norm": 0.6647071242332458, |
|
"learning_rate": 4.378943094321221e-05, |
|
"loss": 0.3628, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.8483033932135728, |
|
"grad_norm": 0.6893953084945679, |
|
"learning_rate": 4.3806482167505196e-05, |
|
"loss": 0.3434, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.8502994011976048, |
|
"grad_norm": 0.8566087484359741, |
|
"learning_rate": 4.382349331837866e-05, |
|
"loss": 0.3803, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.8522954091816367, |
|
"grad_norm": 0.7948191165924072, |
|
"learning_rate": 4.3840464583750404e-05, |
|
"loss": 0.3627, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.8542914171656687, |
|
"grad_norm": 0.6731837391853333, |
|
"learning_rate": 4.385739615021954e-05, |
|
"loss": 0.395, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.8562874251497006, |
|
"grad_norm": 0.6760764122009277, |
|
"learning_rate": 4.387428820307874e-05, |
|
"loss": 0.3627, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.8582834331337326, |
|
"grad_norm": 0.7794198989868164, |
|
"learning_rate": 4.3891140926326446e-05, |
|
"loss": 0.3166, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8602794411177644, |
|
"grad_norm": 0.7948319911956787, |
|
"learning_rate": 4.390795450267886e-05, |
|
"loss": 0.3634, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.8622754491017964, |
|
"grad_norm": 0.6758688688278198, |
|
"learning_rate": 4.3924729113581876e-05, |
|
"loss": 0.3103, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.8642714570858283, |
|
"grad_norm": 1.493560791015625, |
|
"learning_rate": 4.394146493922276e-05, |
|
"loss": 0.3551, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.8662674650698603, |
|
"grad_norm": 0.6501355171203613, |
|
"learning_rate": 4.395816215854185e-05, |
|
"loss": 0.3433, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.8682634730538922, |
|
"grad_norm": 0.7338974475860596, |
|
"learning_rate": 4.397482094924396e-05, |
|
"loss": 0.3748, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.8702594810379242, |
|
"grad_norm": 0.7021346688270569, |
|
"learning_rate": 4.399144148780977e-05, |
|
"loss": 0.3988, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.872255489021956, |
|
"grad_norm": 0.8264355659484863, |
|
"learning_rate": 4.400802394950703e-05, |
|
"loss": 0.3821, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.874251497005988, |
|
"grad_norm": 0.7332090139389038, |
|
"learning_rate": 4.402456850840166e-05, |
|
"loss": 0.3212, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.8762475049900199, |
|
"grad_norm": 0.7158175706863403, |
|
"learning_rate": 4.4041075337368695e-05, |
|
"loss": 0.3014, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.8782435129740519, |
|
"grad_norm": 0.6871099472045898, |
|
"learning_rate": 4.405754460810312e-05, |
|
"loss": 0.3363, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8802395209580839, |
|
"grad_norm": 0.7581283450126648, |
|
"learning_rate": 4.407397649113065e-05, |
|
"loss": 0.3706, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.8822355289421158, |
|
"grad_norm": 0.7075430154800415, |
|
"learning_rate": 4.40903711558182e-05, |
|
"loss": 0.3625, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.8842315369261478, |
|
"grad_norm": 0.6902301907539368, |
|
"learning_rate": 4.41067287703845e-05, |
|
"loss": 0.3459, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.8862275449101796, |
|
"grad_norm": 0.7632633447647095, |
|
"learning_rate": 4.412304950191033e-05, |
|
"loss": 0.3863, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.8882235528942116, |
|
"grad_norm": 0.8091756701469421, |
|
"learning_rate": 4.413933351634886e-05, |
|
"loss": 0.3873, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.8902195608782435, |
|
"grad_norm": 0.7229244709014893, |
|
"learning_rate": 4.4155580978535707e-05, |
|
"loss": 0.3199, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.8922155688622755, |
|
"grad_norm": 0.6914481520652771, |
|
"learning_rate": 4.417179205219895e-05, |
|
"loss": 0.3679, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.8942115768463074, |
|
"grad_norm": 0.6364032030105591, |
|
"learning_rate": 4.418796689996907e-05, |
|
"loss": 0.2962, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.8962075848303394, |
|
"grad_norm": 0.7445045113563538, |
|
"learning_rate": 4.420410568338872e-05, |
|
"loss": 0.4021, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.8982035928143712, |
|
"grad_norm": 0.6447579264640808, |
|
"learning_rate": 4.42202085629224e-05, |
|
"loss": 0.3129, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9001996007984032, |
|
"grad_norm": 0.7040254473686218, |
|
"learning_rate": 4.423627569796601e-05, |
|
"loss": 0.3672, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.9021956087824351, |
|
"grad_norm": 0.6750066876411438, |
|
"learning_rate": 4.425230724685638e-05, |
|
"loss": 0.4024, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.9041916167664671, |
|
"grad_norm": 0.7186387181282043, |
|
"learning_rate": 4.4268303366880536e-05, |
|
"loss": 0.355, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.906187624750499, |
|
"grad_norm": 0.7389270663261414, |
|
"learning_rate": 4.428426421428507e-05, |
|
"loss": 0.4207, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.908183632734531, |
|
"grad_norm": 0.6795611381530762, |
|
"learning_rate": 4.430018994428521e-05, |
|
"loss": 0.3068, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.9101796407185628, |
|
"grad_norm": 0.6613329648971558, |
|
"learning_rate": 4.431608071107392e-05, |
|
"loss": 0.3828, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.9121756487025948, |
|
"grad_norm": 0.7048102021217346, |
|
"learning_rate": 4.433193666783084e-05, |
|
"loss": 0.3921, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.9141716566866267, |
|
"grad_norm": 0.7187650203704834, |
|
"learning_rate": 4.4347757966731156e-05, |
|
"loss": 0.2997, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.9161676646706587, |
|
"grad_norm": 0.7008907794952393, |
|
"learning_rate": 4.436354475895436e-05, |
|
"loss": 0.3478, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.9181636726546906, |
|
"grad_norm": 0.6574254035949707, |
|
"learning_rate": 4.437929719469291e-05, |
|
"loss": 0.317, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.9201596806387226, |
|
"grad_norm": 0.6908730864524841, |
|
"learning_rate": 4.4395015423160807e-05, |
|
"loss": 0.3268, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.9221556886227545, |
|
"grad_norm": 0.676114559173584, |
|
"learning_rate": 4.4410699592602094e-05, |
|
"loss": 0.3791, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.9241516966067864, |
|
"grad_norm": 0.6226547956466675, |
|
"learning_rate": 4.442634985029922e-05, |
|
"loss": 0.36, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.9261477045908184, |
|
"grad_norm": 0.6422531604766846, |
|
"learning_rate": 4.444196634258136e-05, |
|
"loss": 0.379, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.9281437125748503, |
|
"grad_norm": 0.7371797561645508, |
|
"learning_rate": 4.4457549214832566e-05, |
|
"loss": 0.3696, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.9301397205588823, |
|
"grad_norm": 0.6225396394729614, |
|
"learning_rate": 4.44730986115e-05, |
|
"loss": 0.345, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.9321357285429142, |
|
"grad_norm": 0.6568498611450195, |
|
"learning_rate": 4.448861467610187e-05, |
|
"loss": 0.4367, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.9341317365269461, |
|
"grad_norm": 0.6361973881721497, |
|
"learning_rate": 4.4504097551235406e-05, |
|
"loss": 0.3615, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.936127744510978, |
|
"grad_norm": 0.5645039081573486, |
|
"learning_rate": 4.4519547378584725e-05, |
|
"loss": 0.2511, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.93812375249501, |
|
"grad_norm": 1.5839265584945679, |
|
"learning_rate": 4.453496429892863e-05, |
|
"loss": 0.3438, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9401197604790419, |
|
"grad_norm": 0.7127808928489685, |
|
"learning_rate": 4.455034845214827e-05, |
|
"loss": 0.4078, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.9421157684630739, |
|
"grad_norm": 0.9536606073379517, |
|
"learning_rate": 4.4565699977234796e-05, |
|
"loss": 0.3297, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.9441117764471058, |
|
"grad_norm": 0.6458728313446045, |
|
"learning_rate": 4.458101901229686e-05, |
|
"loss": 0.3305, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 0.9461077844311377, |
|
"grad_norm": 0.7509250640869141, |
|
"learning_rate": 4.459630569456809e-05, |
|
"loss": 0.345, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.9481037924151696, |
|
"grad_norm": 2.1286840438842773, |
|
"learning_rate": 4.461156016041444e-05, |
|
"loss": 0.4174, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.9500998003992016, |
|
"grad_norm": 0.668644905090332, |
|
"learning_rate": 4.462678254534156e-05, |
|
"loss": 0.3657, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.9520958083832335, |
|
"grad_norm": 0.7153406739234924, |
|
"learning_rate": 4.464197298400191e-05, |
|
"loss": 0.3401, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 0.9540918163672655, |
|
"grad_norm": 0.62980717420578, |
|
"learning_rate": 4.4657131610201994e-05, |
|
"loss": 0.316, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.9560878243512974, |
|
"grad_norm": 0.733650803565979, |
|
"learning_rate": 4.467225855690939e-05, |
|
"loss": 0.4096, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 0.9580838323353293, |
|
"grad_norm": 0.9371464252471924, |
|
"learning_rate": 4.468735395625979e-05, |
|
"loss": 0.4383, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9600798403193613, |
|
"grad_norm": 0.6547588109970093, |
|
"learning_rate": 4.470241793956387e-05, |
|
"loss": 0.3269, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 0.9620758483033932, |
|
"grad_norm": 0.6767633557319641, |
|
"learning_rate": 4.471745063731416e-05, |
|
"loss": 0.338, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.9640718562874252, |
|
"grad_norm": 0.691611111164093, |
|
"learning_rate": 4.473245217919187e-05, |
|
"loss": 0.3583, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 0.9660678642714571, |
|
"grad_norm": 0.6319297552108765, |
|
"learning_rate": 4.474742269407355e-05, |
|
"loss": 0.333, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.9680638722554891, |
|
"grad_norm": 0.6804649829864502, |
|
"learning_rate": 4.476236231003773e-05, |
|
"loss": 0.388, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.9700598802395209, |
|
"grad_norm": 0.7119168043136597, |
|
"learning_rate": 4.477727115437156e-05, |
|
"loss": 0.3867, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.9720558882235529, |
|
"grad_norm": 0.6172801852226257, |
|
"learning_rate": 4.479214935357724e-05, |
|
"loss": 0.312, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 0.9740518962075848, |
|
"grad_norm": 0.8452144265174866, |
|
"learning_rate": 4.480699703337852e-05, |
|
"loss": 0.4059, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.9760479041916168, |
|
"grad_norm": 0.6802703142166138, |
|
"learning_rate": 4.4821814318727016e-05, |
|
"loss": 0.3789, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 0.9780439121756487, |
|
"grad_norm": 0.6583143472671509, |
|
"learning_rate": 4.483660133380856e-05, |
|
"loss": 0.3354, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9800399201596807, |
|
"grad_norm": 0.6605017781257629, |
|
"learning_rate": 4.485135820204948e-05, |
|
"loss": 0.3842, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 0.9820359281437125, |
|
"grad_norm": 0.7111901640892029, |
|
"learning_rate": 4.486608504612267e-05, |
|
"loss": 0.432, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.9840319361277445, |
|
"grad_norm": 0.6553547978401184, |
|
"learning_rate": 4.488078198795383e-05, |
|
"loss": 0.3503, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 0.9860279441117764, |
|
"grad_norm": 0.8542457818984985, |
|
"learning_rate": 4.489544914872745e-05, |
|
"loss": 0.354, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.9880239520958084, |
|
"grad_norm": 0.680438220500946, |
|
"learning_rate": 4.4910086648892815e-05, |
|
"loss": 0.3528, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.9900199600798403, |
|
"grad_norm": 0.6407065987586975, |
|
"learning_rate": 4.4924694608169965e-05, |
|
"loss": 0.3698, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.9920159680638723, |
|
"grad_norm": 0.6616628170013428, |
|
"learning_rate": 4.4939273145555536e-05, |
|
"loss": 0.3878, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 0.9940119760479041, |
|
"grad_norm": 0.617494523525238, |
|
"learning_rate": 4.495382237932863e-05, |
|
"loss": 0.3155, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.9960079840319361, |
|
"grad_norm": 0.672020435333252, |
|
"learning_rate": 4.4968342427056505e-05, |
|
"loss": 0.3425, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 0.998003992015968, |
|
"grad_norm": 0.6575382351875305, |
|
"learning_rate": 4.498283340560031e-05, |
|
"loss": 0.3599, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.6533491015434265, |
|
"learning_rate": 4.499729543112076e-05, |
|
"loss": 0.3201, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 501, |
|
"total_flos": 5.842272600604017e+17, |
|
"train_loss": 0.47904590670458097, |
|
"train_runtime": 1388.8864, |
|
"train_samples_per_second": 2.881, |
|
"train_steps_per_second": 0.361 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 501, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.842272600604017e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|