diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,6 +1,6 @@ { - "best_metric": 1.5609192848205566, - "best_model_checkpoint": "model/chessformer-3/checkpoint-6000", + "best_metric": 1.5398365259170532, + "best_model_checkpoint": "model/chessformer-3/checkpoint-7000", "epoch": 0.3112356053532524, "eval_steps": 500, "global_step": 7000, @@ -21106,3518 +21106,3518 @@ }, { "epoch": 0.26686230047574583, - "grad_norm": 0.09383466094732285, + "grad_norm": 0.09536412358283997, "learning_rate": 0.0009922200143770103, - "loss": 1.5809, + "loss": 1.5798, "step": 6002 }, { "epoch": 0.2669512249344182, - "grad_norm": 0.09044884890317917, + "grad_norm": 0.0923578068614006, "learning_rate": 0.000992213807834307, - "loss": 1.5895, + "loss": 1.5833, "step": 6004 }, { "epoch": 0.26704014939309056, - "grad_norm": 0.09137442708015442, + "grad_norm": 0.0896889865398407, "learning_rate": 0.00099220759883636, - "loss": 1.5823, + "loss": 1.5824, "step": 6006 }, { "epoch": 0.26712907385176293, - "grad_norm": 0.09045607596635818, + "grad_norm": 0.0965195745229721, "learning_rate": 0.0009922013873831994, - "loss": 1.5818, + "loss": 1.5842, "step": 6008 }, { "epoch": 0.2672179983104353, - "grad_norm": 0.09125877171754837, + "grad_norm": 0.09298788011074066, "learning_rate": 0.0009921951734748566, - "loss": 1.5742, + "loss": 1.575, "step": 6010 }, { "epoch": 0.26730692276910767, - "grad_norm": 0.09353230893611908, + "grad_norm": 0.0938933789730072, "learning_rate": 0.0009921889571113628, - "loss": 1.5855, + "loss": 1.5787, "step": 6012 }, { "epoch": 0.26739584722778, - "grad_norm": 0.09157554805278778, + "grad_norm": 0.09369226545095444, "learning_rate": 0.0009921827382927486, - "loss": 1.5821, + "loss": 1.5808, "step": 6014 }, { "epoch": 0.26748477168645235, - "grad_norm": 0.09111808240413666, + "grad_norm": 0.09126924723386765, "learning_rate": 0.0009921765170190454, - "loss": 1.583, + "loss": 1.5752, "step": 6016 }, { "epoch": 0.2675736961451247, - "grad_norm": 0.08982241153717041, + "grad_norm": 0.09294477850198746, "learning_rate": 0.0009921702932902839, - "loss": 1.5846, + "loss": 1.5761, "step": 6018 }, { "epoch": 0.2676626206037971, - "grad_norm": 0.09271515905857086, + "grad_norm": 0.09301607310771942, "learning_rate": 0.0009921640671064953, - "loss": 1.5822, + "loss": 1.5772, "step": 6020 }, { "epoch": 0.26775154506246945, - "grad_norm": 0.08831150084733963, + "grad_norm": 0.09419137239456177, "learning_rate": 0.0009921578384677106, - "loss": 1.5833, + "loss": 1.5782, "step": 6022 }, { "epoch": 0.26784046952114177, - "grad_norm": 0.09009987115859985, + "grad_norm": 0.09414952993392944, "learning_rate": 0.000992151607373961, - "loss": 1.5828, + "loss": 1.5785, "step": 6024 }, { "epoch": 0.26792939397981413, - "grad_norm": 0.09657534956932068, + "grad_norm": 0.09365058690309525, "learning_rate": 0.0009921453738252774, - "loss": 1.5829, + "loss": 1.584, "step": 6026 }, { "epoch": 0.2680183184384865, - "grad_norm": 0.09146089851856232, + "grad_norm": 0.09278295934200287, "learning_rate": 0.000992139137821691, - "loss": 1.5837, + "loss": 1.5855, "step": 6028 }, { "epoch": 0.26810724289715887, - "grad_norm": 0.08794407546520233, + "grad_norm": 0.09213846176862717, "learning_rate": 0.000992132899363233, - "loss": 1.5822, + "loss": 1.5798, "step": 6030 }, { "epoch": 0.26819616735583124, - "grad_norm": 0.09218079596757889, + "grad_norm": 0.0945931226015091, "learning_rate": 0.0009921266584499344, - "loss": 1.5837, + "loss": 1.5792, "step": 6032 }, { "epoch": 0.2682850918145036, - "grad_norm": 0.09184197336435318, + "grad_norm": 0.09439346194267273, "learning_rate": 0.000992120415081826, - "loss": 1.5832, + "loss": 1.5722, "step": 6034 }, { "epoch": 0.2683740162731759, - "grad_norm": 0.09737716615200043, + "grad_norm": 0.09740027040243149, "learning_rate": 0.0009921141692589396, - "loss": 1.5813, + "loss": 1.582, "step": 6036 }, { "epoch": 0.2684629407318483, - "grad_norm": 0.09055211395025253, + "grad_norm": 0.0935557633638382, "learning_rate": 0.0009921079209813063, - "loss": 1.5764, + "loss": 1.581, "step": 6038 }, { "epoch": 0.26855186519052066, - "grad_norm": 0.0906216949224472, + "grad_norm": 0.09402929991483688, "learning_rate": 0.0009921016702489564, - "loss": 1.5864, + "loss": 1.5772, "step": 6040 }, { "epoch": 0.268640789649193, - "grad_norm": 0.08897050470113754, + "grad_norm": 0.09664035588502884, "learning_rate": 0.000992095417061922, - "loss": 1.5817, + "loss": 1.5807, "step": 6042 }, { "epoch": 0.2687297141078654, - "grad_norm": 0.09858798235654831, + "grad_norm": 0.09566997736692429, "learning_rate": 0.000992089161420234, - "loss": 1.5811, + "loss": 1.5822, "step": 6044 }, { "epoch": 0.2688186385665377, - "grad_norm": 0.10423325002193451, + "grad_norm": 0.09569090604782104, "learning_rate": 0.0009920829033239235, - "loss": 1.5783, + "loss": 1.5799, "step": 6046 }, { "epoch": 0.2689075630252101, - "grad_norm": 0.09769600629806519, + "grad_norm": 0.09623146057128906, "learning_rate": 0.0009920766427730218, - "loss": 1.5877, + "loss": 1.5803, "step": 6048 }, { "epoch": 0.26899648748388244, - "grad_norm": 0.10101058334112167, + "grad_norm": 0.09390247613191605, "learning_rate": 0.0009920703797675603, - "loss": 1.5815, + "loss": 1.58, "step": 6050 }, { "epoch": 0.2690854119425548, - "grad_norm": 0.09779931604862213, + "grad_norm": 0.09540102630853653, "learning_rate": 0.0009920641143075699, - "loss": 1.5851, + "loss": 1.5818, "step": 6052 }, { "epoch": 0.2691743364012272, - "grad_norm": 0.0930139422416687, + "grad_norm": 0.09371043741703033, "learning_rate": 0.000992057846393082, - "loss": 1.5868, + "loss": 1.5885, "step": 6054 }, { "epoch": 0.2692632608598995, - "grad_norm": 0.09557168185710907, + "grad_norm": 0.09126854687929153, "learning_rate": 0.000992051576024128, - "loss": 1.5807, + "loss": 1.5794, "step": 6056 }, { "epoch": 0.26935218531857186, - "grad_norm": 0.09590741991996765, + "grad_norm": 0.09539595991373062, "learning_rate": 0.0009920453032007388, - "loss": 1.5827, + "loss": 1.5784, "step": 6058 }, { "epoch": 0.2694411097772442, - "grad_norm": 0.09438788145780563, + "grad_norm": 0.09004338830709457, "learning_rate": 0.0009920390279229462, - "loss": 1.5828, + "loss": 1.5753, "step": 6060 }, { "epoch": 0.2695300342359166, - "grad_norm": 0.09137403964996338, + "grad_norm": 0.09474993497133255, "learning_rate": 0.0009920327501907811, - "loss": 1.5878, + "loss": 1.5854, "step": 6062 }, { "epoch": 0.26961895869458896, - "grad_norm": 0.09171666204929352, + "grad_norm": 0.10050513595342636, "learning_rate": 0.000992026470004275, - "loss": 1.5858, + "loss": 1.5815, "step": 6064 }, { "epoch": 0.26970788315326133, - "grad_norm": 0.09843774139881134, + "grad_norm": 0.09140043705701828, "learning_rate": 0.0009920201873634593, - "loss": 1.5875, + "loss": 1.58, "step": 6066 }, { "epoch": 0.26979680761193364, - "grad_norm": 0.09668837487697601, + "grad_norm": 0.09281838685274124, "learning_rate": 0.0009920139022683652, - "loss": 1.5772, + "loss": 1.5788, "step": 6068 }, { "epoch": 0.269885732070606, - "grad_norm": 0.09147980064153671, + "grad_norm": 0.0929790586233139, "learning_rate": 0.0009920076147190242, - "loss": 1.5843, + "loss": 1.5826, "step": 6070 }, { "epoch": 0.2699746565292784, - "grad_norm": 0.0925716683268547, + "grad_norm": 0.09214480966329575, "learning_rate": 0.0009920013247154674, - "loss": 1.5818, + "loss": 1.5772, "step": 6072 }, { "epoch": 0.27006358098795075, - "grad_norm": 0.08949911594390869, + "grad_norm": 0.09295648336410522, "learning_rate": 0.0009919950322577263, - "loss": 1.5794, + "loss": 1.5797, "step": 6074 }, { "epoch": 0.2701525054466231, - "grad_norm": 0.09948768466711044, + "grad_norm": 0.094325952231884, "learning_rate": 0.0009919887373458325, "loss": 1.586, "step": 6076 }, { "epoch": 0.2702414299052954, - "grad_norm": 0.09083034843206406, + "grad_norm": 0.09574458748102188, "learning_rate": 0.0009919824399798169, - "loss": 1.5781, + "loss": 1.5852, "step": 6078 }, { "epoch": 0.2703303543639678, - "grad_norm": 0.0930861383676529, + "grad_norm": 0.09337875992059708, "learning_rate": 0.0009919761401597114, - "loss": 1.5862, + "loss": 1.5748, "step": 6080 }, { "epoch": 0.27041927882264016, - "grad_norm": 0.09269826114177704, + "grad_norm": 0.09532225131988525, "learning_rate": 0.0009919698378855473, - "loss": 1.5811, + "loss": 1.5696, "step": 6082 }, { "epoch": 0.27050820328131253, - "grad_norm": 0.09123250097036362, + "grad_norm": 0.09588050097227097, "learning_rate": 0.000991963533157356, - "loss": 1.5804, + "loss": 1.585, "step": 6084 }, { "epoch": 0.2705971277399849, - "grad_norm": 0.08713556826114655, + "grad_norm": 0.09399744123220444, "learning_rate": 0.0009919572259751688, - "loss": 1.5828, + "loss": 1.5727, "step": 6086 }, { "epoch": 0.27068605219865727, - "grad_norm": 0.09103381633758545, + "grad_norm": 0.09603196382522583, "learning_rate": 0.0009919509163390174, - "loss": 1.578, + "loss": 1.5792, "step": 6088 }, { "epoch": 0.2707749766573296, - "grad_norm": 0.08987361192703247, + "grad_norm": 0.09013631939888, "learning_rate": 0.000991944604248933, - "loss": 1.5771, + "loss": 1.5795, "step": 6090 }, { "epoch": 0.27086390111600195, - "grad_norm": 0.08781355619430542, + "grad_norm": 0.08968756347894669, "learning_rate": 0.0009919382897049475, - "loss": 1.583, + "loss": 1.574, "step": 6092 }, { "epoch": 0.2709528255746743, - "grad_norm": 0.08987694978713989, + "grad_norm": 0.090090312063694, "learning_rate": 0.0009919319727070919, - "loss": 1.5835, + "loss": 1.5763, "step": 6094 }, { "epoch": 0.2710417500333467, - "grad_norm": 0.09166261553764343, + "grad_norm": 0.09772195667028427, "learning_rate": 0.0009919256532553982, - "loss": 1.5789, + "loss": 1.5812, "step": 6096 }, { "epoch": 0.27113067449201905, - "grad_norm": 0.09571800380945206, + "grad_norm": 0.09192989021539688, "learning_rate": 0.0009919193313498976, - "loss": 1.5791, + "loss": 1.5723, "step": 6098 }, { "epoch": 0.27121959895069137, - "grad_norm": 0.08988688141107559, + "grad_norm": 0.091171033680439, "learning_rate": 0.0009919130069906217, - "loss": 1.572, + "loss": 1.5751, "step": 6100 }, { "epoch": 0.27130852340936373, - "grad_norm": 0.09290190041065216, + "grad_norm": 0.0916176363825798, "learning_rate": 0.000991906680177602, - "loss": 1.5869, + "loss": 1.5787, "step": 6102 }, { "epoch": 0.2713974478680361, - "grad_norm": 0.09076294302940369, + "grad_norm": 0.09237353503704071, "learning_rate": 0.0009919003509108702, - "loss": 1.5789, + "loss": 1.5746, "step": 6104 }, { "epoch": 0.27148637232670847, - "grad_norm": 0.09125485271215439, + "grad_norm": 0.09654420614242554, "learning_rate": 0.0009918940191904577, - "loss": 1.5809, + "loss": 1.5804, "step": 6106 }, { "epoch": 0.27157529678538084, - "grad_norm": 0.08847089856863022, + "grad_norm": 0.09240265935659409, "learning_rate": 0.0009918876850163966, - "loss": 1.5798, + "loss": 1.5769, "step": 6108 }, { "epoch": 0.27166422124405315, - "grad_norm": 0.09034041315317154, + "grad_norm": 0.09678072482347488, "learning_rate": 0.0009918813483887176, - "loss": 1.5793, + "loss": 1.5787, "step": 6110 }, { "epoch": 0.2717531457027255, - "grad_norm": 0.09318964928388596, + "grad_norm": 0.09163809567689896, "learning_rate": 0.000991875009307453, - "loss": 1.5787, + "loss": 1.581, "step": 6112 }, { "epoch": 0.2718420701613979, - "grad_norm": 0.08992137014865875, + "grad_norm": 0.09179195761680603, "learning_rate": 0.000991868667772634, - "loss": 1.5787, + "loss": 1.5799, "step": 6114 }, { "epoch": 0.27193099462007025, - "grad_norm": 0.0950603038072586, + "grad_norm": 0.09309026598930359, "learning_rate": 0.0009918623237842926, - "loss": 1.5794, + "loss": 1.5749, "step": 6116 }, { "epoch": 0.2720199190787426, - "grad_norm": 0.09443898499011993, + "grad_norm": 0.09055127948522568, "learning_rate": 0.0009918559773424603, - "loss": 1.5817, + "loss": 1.5776, "step": 6118 }, { "epoch": 0.272108843537415, - "grad_norm": 0.1001880094408989, + "grad_norm": 0.09597539901733398, "learning_rate": 0.0009918496284471688, - "loss": 1.5882, + "loss": 1.5733, "step": 6120 }, { "epoch": 0.2721977679960873, - "grad_norm": 0.0912918969988823, + "grad_norm": 0.09350308775901794, "learning_rate": 0.0009918432770984495, - "loss": 1.5879, + "loss": 1.5775, "step": 6122 }, { "epoch": 0.27228669245475967, - "grad_norm": 0.0911942571401596, + "grad_norm": 0.09235924482345581, "learning_rate": 0.0009918369232963344, - "loss": 1.587, + "loss": 1.5705, "step": 6124 }, { "epoch": 0.27237561691343204, - "grad_norm": 0.09879928827285767, + "grad_norm": 0.09772805869579315, "learning_rate": 0.0009918305670408552, - "loss": 1.5862, + "loss": 1.5795, "step": 6126 }, { "epoch": 0.2724645413721044, - "grad_norm": 0.09580254554748535, + "grad_norm": 0.09387152642011642, "learning_rate": 0.0009918242083320433, - "loss": 1.5782, + "loss": 1.5857, "step": 6128 }, { "epoch": 0.2725534658307768, - "grad_norm": 0.09700783342123032, + "grad_norm": 0.08818736672401428, "learning_rate": 0.0009918178471699309, - "loss": 1.5786, + "loss": 1.5767, "step": 6130 }, { "epoch": 0.2726423902894491, - "grad_norm": 0.0989309698343277, + "grad_norm": 0.09092088788747787, "learning_rate": 0.000991811483554549, - "loss": 1.5916, + "loss": 1.5775, "step": 6132 }, { "epoch": 0.27273131474812146, - "grad_norm": 0.09456934779882431, + "grad_norm": 0.09008492529392242, "learning_rate": 0.0009918051174859302, - "loss": 1.5819, + "loss": 1.5767, "step": 6134 }, { "epoch": 0.2728202392067938, - "grad_norm": 0.08964252471923828, + "grad_norm": 0.0906415656208992, "learning_rate": 0.0009917987489641057, - "loss": 1.577, + "loss": 1.5746, "step": 6136 }, { "epoch": 0.2729091636654662, - "grad_norm": 0.10097464174032211, + "grad_norm": 0.09450238943099976, "learning_rate": 0.0009917923779891073, - "loss": 1.584, + "loss": 1.5732, "step": 6138 }, { "epoch": 0.27299808812413856, - "grad_norm": 0.09702183306217194, + "grad_norm": 0.08837480843067169, "learning_rate": 0.000991786004560967, - "loss": 1.5841, + "loss": 1.5756, "step": 6140 }, { "epoch": 0.27308701258281093, - "grad_norm": 0.10218867659568787, + "grad_norm": 0.09205223619937897, "learning_rate": 0.0009917796286797163, - "loss": 1.5817, + "loss": 1.5763, "step": 6142 }, { "epoch": 0.27317593704148324, - "grad_norm": 0.09574905782938004, + "grad_norm": 0.0925694927573204, "learning_rate": 0.0009917732503453873, - "loss": 1.5808, + "loss": 1.5766, "step": 6144 }, { "epoch": 0.2732648615001556, - "grad_norm": 0.09571053087711334, + "grad_norm": 0.09327147156000137, "learning_rate": 0.0009917668695580118, - "loss": 1.5801, + "loss": 1.5742, "step": 6146 }, { "epoch": 0.273353785958828, - "grad_norm": 0.09228259325027466, + "grad_norm": 0.09639228135347366, "learning_rate": 0.0009917604863176215, - "loss": 1.5786, + "loss": 1.5805, "step": 6148 }, { "epoch": 0.27344271041750035, - "grad_norm": 0.09447211027145386, + "grad_norm": 0.09437435865402222, "learning_rate": 0.0009917541006242482, - "loss": 1.5759, + "loss": 1.5835, "step": 6150 }, { "epoch": 0.2735316348761727, - "grad_norm": 0.09275129437446594, + "grad_norm": 0.09421025216579437, "learning_rate": 0.0009917477124779237, - "loss": 1.5876, + "loss": 1.5827, "step": 6152 }, { "epoch": 0.273620559334845, - "grad_norm": 0.09325329959392548, + "grad_norm": 0.09246162325143814, "learning_rate": 0.0009917413218786802, - "loss": 1.5822, + "loss": 1.5771, "step": 6154 }, { "epoch": 0.2737094837935174, - "grad_norm": 0.09562984108924866, + "grad_norm": 0.0891774520277977, "learning_rate": 0.0009917349288265493, - "loss": 1.5852, + "loss": 1.5819, "step": 6156 }, { "epoch": 0.27379840825218976, - "grad_norm": 0.09172508865594864, + "grad_norm": 0.09381581842899323, "learning_rate": 0.0009917285333215628, - "loss": 1.5799, + "loss": 1.5758, "step": 6158 }, { "epoch": 0.27388733271086213, - "grad_norm": 0.09300266951322556, + "grad_norm": 0.09442843496799469, "learning_rate": 0.000991722135363753, - "loss": 1.5707, + "loss": 1.5837, "step": 6160 }, { "epoch": 0.2739762571695345, - "grad_norm": 0.09159931540489197, + "grad_norm": 0.08775163441896439, "learning_rate": 0.0009917157349531513, - "loss": 1.5772, + "loss": 1.581, "step": 6162 }, { "epoch": 0.2740651816282068, - "grad_norm": 0.10545887053012848, + "grad_norm": 0.0929328128695488, "learning_rate": 0.00099170933208979, - "loss": 1.5871, + "loss": 1.5758, "step": 6164 }, { "epoch": 0.2741541060868792, - "grad_norm": 0.10513417422771454, + "grad_norm": 0.09051446616649628, "learning_rate": 0.000991702926773701, - "loss": 1.5862, + "loss": 1.5774, "step": 6166 }, { "epoch": 0.27424303054555155, - "grad_norm": 0.09930409491062164, + "grad_norm": 0.09004546701908112, "learning_rate": 0.0009916965190049161, - "loss": 1.5761, + "loss": 1.5744, "step": 6168 }, { "epoch": 0.2743319550042239, - "grad_norm": 0.09921613335609436, + "grad_norm": 0.0947934165596962, "learning_rate": 0.0009916901087834676, - "loss": 1.5813, + "loss": 1.5806, "step": 6170 }, { "epoch": 0.2744208794628963, - "grad_norm": 0.10508844256401062, + "grad_norm": 0.0934051126241684, "learning_rate": 0.0009916836961093869, - "loss": 1.5816, + "loss": 1.5796, "step": 6172 }, { "epoch": 0.27450980392156865, - "grad_norm": 0.10168485343456268, + "grad_norm": 0.09276668727397919, "learning_rate": 0.0009916772809827063, - "loss": 1.5808, + "loss": 1.5775, "step": 6174 }, { "epoch": 0.27459872838024096, - "grad_norm": 0.08906201273202896, + "grad_norm": 0.09658815711736679, "learning_rate": 0.0009916708634034582, - "loss": 1.579, + "loss": 1.5745, "step": 6176 }, { "epoch": 0.27468765283891333, - "grad_norm": 0.09412112832069397, + "grad_norm": 0.09510058909654617, "learning_rate": 0.0009916644433716738, - "loss": 1.5884, + "loss": 1.5706, "step": 6178 }, { "epoch": 0.2747765772975857, - "grad_norm": 0.08810826390981674, + "grad_norm": 0.09232553839683533, "learning_rate": 0.0009916580208873859, - "loss": 1.5786, + "loss": 1.5735, "step": 6180 }, { "epoch": 0.27486550175625807, - "grad_norm": 0.09497103095054626, + "grad_norm": 0.09260194003582001, "learning_rate": 0.0009916515959506257, - "loss": 1.5813, + "loss": 1.5781, "step": 6182 }, { "epoch": 0.27495442621493044, - "grad_norm": 0.09796647727489471, + "grad_norm": 0.09699893742799759, "learning_rate": 0.000991645168561426, - "loss": 1.5816, + "loss": 1.5788, "step": 6184 }, { "epoch": 0.27504335067360275, - "grad_norm": 0.08955618739128113, + "grad_norm": 0.0900624617934227, "learning_rate": 0.0009916387387198186, - "loss": 1.5805, + "loss": 1.5794, "step": 6186 }, { "epoch": 0.2751322751322751, - "grad_norm": 0.09655381739139557, + "grad_norm": 0.08997295051813126, "learning_rate": 0.0009916323064258358, - "loss": 1.5803, + "loss": 1.5737, "step": 6188 }, { "epoch": 0.2752211995909475, - "grad_norm": 0.09500830620527267, + "grad_norm": 0.0929020345211029, "learning_rate": 0.000991625871679509, - "loss": 1.5835, + "loss": 1.5808, "step": 6190 }, { "epoch": 0.27531012404961985, - "grad_norm": 0.09726283699274063, + "grad_norm": 0.09494993090629578, "learning_rate": 0.000991619434480871, - "loss": 1.5799, + "loss": 1.5808, "step": 6192 }, { "epoch": 0.2753990485082922, - "grad_norm": 0.0956970602273941, + "grad_norm": 0.08931610733270645, "learning_rate": 0.0009916129948299537, - "loss": 1.5802, + "loss": 1.5718, "step": 6194 }, { "epoch": 0.2754879729669646, - "grad_norm": 0.08765096962451935, + "grad_norm": 0.09554082900285721, "learning_rate": 0.000991606552726789, - "loss": 1.5856, + "loss": 1.5794, "step": 6196 }, { "epoch": 0.2755768974256369, - "grad_norm": 0.09525011479854584, + "grad_norm": 0.09193310141563416, "learning_rate": 0.0009916001081714091, - "loss": 1.5797, + "loss": 1.5806, "step": 6198 }, { "epoch": 0.27566582188430927, - "grad_norm": 0.09269862622022629, + "grad_norm": 0.08917775005102158, "learning_rate": 0.0009915936611638466, - "loss": 1.5804, + "loss": 1.5737, "step": 6200 }, { "epoch": 0.27575474634298164, - "grad_norm": 0.09025751799345016, + "grad_norm": 0.0933012068271637, "learning_rate": 0.0009915872117041333, - "loss": 1.5843, + "loss": 1.5803, "step": 6202 }, { "epoch": 0.275843670801654, - "grad_norm": 0.0937366932630539, + "grad_norm": 0.09677395969629288, "learning_rate": 0.0009915807597923013, - "loss": 1.5795, + "loss": 1.5793, "step": 6204 }, { "epoch": 0.2759325952603264, - "grad_norm": 0.0941375270485878, + "grad_norm": 0.09007348865270615, "learning_rate": 0.0009915743054283827, - "loss": 1.5762, + "loss": 1.5755, "step": 6206 }, { "epoch": 0.2760215197189987, - "grad_norm": 0.09379082173109055, + "grad_norm": 0.09194600582122803, "learning_rate": 0.00099156784861241, - "loss": 1.5774, + "loss": 1.5735, "step": 6208 }, { "epoch": 0.27611044417767105, - "grad_norm": 0.09712362289428711, + "grad_norm": 0.09006384760141373, "learning_rate": 0.0009915613893444154, - "loss": 1.5784, + "loss": 1.574, "step": 6210 }, { "epoch": 0.2761993686363434, - "grad_norm": 0.09576690942049026, + "grad_norm": 0.09457103908061981, "learning_rate": 0.0009915549276244308, - "loss": 1.5797, + "loss": 1.577, "step": 6212 }, { "epoch": 0.2762882930950158, - "grad_norm": 0.09705163538455963, + "grad_norm": 0.09670677036046982, "learning_rate": 0.0009915484634524887, - "loss": 1.5831, + "loss": 1.5838, "step": 6214 }, { "epoch": 0.27637721755368816, - "grad_norm": 0.094864122569561, + "grad_norm": 0.09744661301374435, "learning_rate": 0.0009915419968286212, - "loss": 1.5829, + "loss": 1.58, "step": 6216 }, { "epoch": 0.2764661420123605, - "grad_norm": 0.08811978250741959, + "grad_norm": 0.09463924914598465, "learning_rate": 0.0009915355277528607, - "loss": 1.5844, + "loss": 1.5767, "step": 6218 }, { "epoch": 0.27655506647103284, - "grad_norm": 0.09015826135873795, + "grad_norm": 0.09342074394226074, "learning_rate": 0.0009915290562252394, - "loss": 1.5886, + "loss": 1.5844, "step": 6220 }, { "epoch": 0.2766439909297052, - "grad_norm": 0.08987385034561157, + "grad_norm": 0.08680648356676102, "learning_rate": 0.0009915225822457897, - "loss": 1.5882, + "loss": 1.5792, "step": 6222 }, { "epoch": 0.2767329153883776, - "grad_norm": 0.08841314166784286, + "grad_norm": 0.09087889641523361, "learning_rate": 0.0009915161058145436, - "loss": 1.5761, + "loss": 1.5797, "step": 6224 }, { "epoch": 0.27682183984704994, - "grad_norm": 0.08853735774755478, + "grad_norm": 0.0886339470744133, "learning_rate": 0.0009915096269315336, - "loss": 1.5766, + "loss": 1.5726, "step": 6226 }, { "epoch": 0.2769107643057223, - "grad_norm": 0.09456057846546173, + "grad_norm": 0.08888717740774155, "learning_rate": 0.000991503145596792, - "loss": 1.574, + "loss": 1.5751, "step": 6228 }, { "epoch": 0.2769996887643946, - "grad_norm": 0.08918725699186325, + "grad_norm": 0.09028730541467667, "learning_rate": 0.0009914966618103512, - "loss": 1.58, + "loss": 1.581, "step": 6230 }, { "epoch": 0.277088613223067, - "grad_norm": 0.08875824511051178, + "grad_norm": 0.09174933284521103, "learning_rate": 0.0009914901755722434, - "loss": 1.5818, + "loss": 1.5828, "step": 6232 }, { "epoch": 0.27717753768173936, - "grad_norm": 0.08938588201999664, + "grad_norm": 0.09008409827947617, "learning_rate": 0.000991483686882501, - "loss": 1.5811, + "loss": 1.5725, "step": 6234 }, { "epoch": 0.27726646214041173, - "grad_norm": 0.09332944452762604, + "grad_norm": 0.09247057884931564, "learning_rate": 0.0009914771957411563, - "loss": 1.5737, + "loss": 1.584, "step": 6236 }, { "epoch": 0.2773553865990841, - "grad_norm": 0.09279682487249374, + "grad_norm": 0.09128382802009583, "learning_rate": 0.000991470702148242, - "loss": 1.5822, + "loss": 1.5849, "step": 6238 }, { "epoch": 0.2774443110577564, - "grad_norm": 0.09213843941688538, + "grad_norm": 0.09105904400348663, "learning_rate": 0.0009914642061037901, - "loss": 1.5816, + "loss": 1.5744, "step": 6240 }, { "epoch": 0.2775332355164288, - "grad_norm": 0.0885494276881218, + "grad_norm": 0.08844447880983353, "learning_rate": 0.0009914577076078333, - "loss": 1.5839, + "loss": 1.5689, "step": 6242 }, { "epoch": 0.27762215997510115, - "grad_norm": 0.08852691948413849, + "grad_norm": 0.08989200741052628, "learning_rate": 0.0009914512066604036, - "loss": 1.5798, + "loss": 1.5729, "step": 6244 }, { "epoch": 0.2777110844337735, - "grad_norm": 0.09214050322771072, + "grad_norm": 0.09029513597488403, "learning_rate": 0.0009914447032615338, - "loss": 1.5791, + "loss": 1.573, "step": 6246 }, { "epoch": 0.2778000088924459, - "grad_norm": 0.08975578099489212, + "grad_norm": 0.08931106328964233, "learning_rate": 0.0009914381974112565, - "loss": 1.5789, + "loss": 1.5832, "step": 6248 }, { "epoch": 0.27788893335111825, - "grad_norm": 0.08976390957832336, + "grad_norm": 0.08635377138853073, "learning_rate": 0.0009914316891096037, - "loss": 1.5823, + "loss": 1.5818, "step": 6250 }, { "epoch": 0.27797785780979056, - "grad_norm": 0.09317343682050705, + "grad_norm": 0.08634750545024872, "learning_rate": 0.0009914251783566081, - "loss": 1.5833, + "loss": 1.5733, "step": 6252 }, { "epoch": 0.27806678226846293, - "grad_norm": 0.09529649466276169, + "grad_norm": 0.09367714077234268, "learning_rate": 0.0009914186651523021, - "loss": 1.584, + "loss": 1.5813, "step": 6254 }, { "epoch": 0.2781557067271353, - "grad_norm": 0.09351009875535965, + "grad_norm": 0.08927787095308304, "learning_rate": 0.0009914121494967183, - "loss": 1.5768, + "loss": 1.5758, "step": 6256 }, { "epoch": 0.27824463118580767, - "grad_norm": 0.0957091674208641, + "grad_norm": 0.0868934839963913, "learning_rate": 0.000991405631389889, - "loss": 1.5817, + "loss": 1.5718, "step": 6258 }, { "epoch": 0.27833355564448004, - "grad_norm": 0.09383245557546616, + "grad_norm": 0.09208554029464722, "learning_rate": 0.000991399110831847, - "loss": 1.5804, + "loss": 1.5722, "step": 6260 }, { "epoch": 0.27842248010315235, - "grad_norm": 0.09093508869409561, + "grad_norm": 0.09598352760076523, "learning_rate": 0.0009913925878226246, - "loss": 1.5791, + "loss": 1.5769, "step": 6262 }, { "epoch": 0.2785114045618247, - "grad_norm": 0.08917820453643799, + "grad_norm": 0.09622900187969208, "learning_rate": 0.0009913860623622544, - "loss": 1.5798, + "loss": 1.5752, "step": 6264 }, { "epoch": 0.2786003290204971, - "grad_norm": 0.08865681290626526, + "grad_norm": 0.09845644980669022, "learning_rate": 0.0009913795344507687, - "loss": 1.5735, + "loss": 1.5793, "step": 6266 }, { "epoch": 0.27868925347916945, - "grad_norm": 0.08903191238641739, + "grad_norm": 0.08853481709957123, "learning_rate": 0.0009913730040882007, - "loss": 1.577, + "loss": 1.5745, "step": 6268 }, { "epoch": 0.2787781779378418, - "grad_norm": 0.0903497263789177, + "grad_norm": 0.09148851037025452, "learning_rate": 0.0009913664712745825, - "loss": 1.5788, + "loss": 1.5824, "step": 6270 }, { "epoch": 0.2788671023965142, - "grad_norm": 0.09220908582210541, + "grad_norm": 0.09253410995006561, "learning_rate": 0.0009913599360099469, - "loss": 1.577, + "loss": 1.5755, "step": 6272 }, { "epoch": 0.2789560268551865, - "grad_norm": 0.08878740668296814, + "grad_norm": 0.09554262459278107, "learning_rate": 0.000991353398294326, - "loss": 1.5652, + "loss": 1.5737, "step": 6274 }, { "epoch": 0.27904495131385887, - "grad_norm": 0.08956126868724823, + "grad_norm": 0.09318879246711731, "learning_rate": 0.000991346858127753, - "loss": 1.5773, + "loss": 1.5677, "step": 6276 }, { "epoch": 0.27913387577253124, - "grad_norm": 0.08757670223712921, + "grad_norm": 0.09248467534780502, "learning_rate": 0.0009913403155102603, - "loss": 1.5765, + "loss": 1.5769, "step": 6278 }, { "epoch": 0.2792228002312036, - "grad_norm": 0.09157538414001465, + "grad_norm": 0.09165968000888824, "learning_rate": 0.0009913337704418805, - "loss": 1.5868, + "loss": 1.5741, "step": 6280 }, { "epoch": 0.279311724689876, - "grad_norm": 0.08848825842142105, + "grad_norm": 0.09123395383358002, "learning_rate": 0.0009913272229226465, - "loss": 1.5768, + "loss": 1.5758, "step": 6282 }, { "epoch": 0.2794006491485483, - "grad_norm": 0.08986431360244751, + "grad_norm": 0.08967551589012146, "learning_rate": 0.0009913206729525905, - "loss": 1.5832, + "loss": 1.5773, "step": 6284 }, { "epoch": 0.27948957360722065, - "grad_norm": 0.09068012982606888, + "grad_norm": 0.08778689056634903, "learning_rate": 0.0009913141205317455, - "loss": 1.5813, + "loss": 1.5767, "step": 6286 }, { "epoch": 0.279578498065893, - "grad_norm": 0.09753326326608658, + "grad_norm": 0.084915891289711, "learning_rate": 0.0009913075656601442, - "loss": 1.5766, + "loss": 1.5787, "step": 6288 }, { "epoch": 0.2796674225245654, - "grad_norm": 0.09468485414981842, + "grad_norm": 0.0918944925069809, "learning_rate": 0.0009913010083378191, - "loss": 1.582, + "loss": 1.5747, "step": 6290 }, { "epoch": 0.27975634698323776, - "grad_norm": 0.08822248131036758, + "grad_norm": 0.09487152099609375, "learning_rate": 0.0009912944485648031, - "loss": 1.5779, + "loss": 1.5794, "step": 6292 }, { "epoch": 0.27984527144191007, - "grad_norm": 0.08877654373645782, + "grad_norm": 0.08974961936473846, "learning_rate": 0.000991287886341129, - "loss": 1.5761, + "loss": 1.5725, "step": 6294 }, { "epoch": 0.27993419590058244, - "grad_norm": 0.09508965164422989, + "grad_norm": 0.09361551702022552, "learning_rate": 0.0009912813216668291, - "loss": 1.5785, + "loss": 1.5786, "step": 6296 }, { "epoch": 0.2800231203592548, - "grad_norm": 0.09204161167144775, + "grad_norm": 0.08920656889677048, "learning_rate": 0.0009912747545419364, - "loss": 1.5762, + "loss": 1.581, "step": 6298 }, { "epoch": 0.2801120448179272, - "grad_norm": 0.08800431340932846, + "grad_norm": 0.08863282203674316, "learning_rate": 0.0009912681849664838, - "loss": 1.574, + "loss": 1.5788, "step": 6300 }, { "epoch": 0.28020096927659954, - "grad_norm": 0.08633691817522049, + "grad_norm": 0.08976864069700241, "learning_rate": 0.000991261612940504, - "loss": 1.5831, + "loss": 1.5735, "step": 6302 }, { "epoch": 0.2802898937352719, - "grad_norm": 0.09139908850193024, + "grad_norm": 0.08996647596359253, "learning_rate": 0.0009912550384640296, - "loss": 1.5789, + "loss": 1.5788, "step": 6304 }, { "epoch": 0.2803788181939442, - "grad_norm": 0.08665833622217178, + "grad_norm": 0.09197447448968887, "learning_rate": 0.0009912484615370936, - "loss": 1.5813, + "loss": 1.5751, "step": 6306 }, { "epoch": 0.2804677426526166, - "grad_norm": 0.09282457828521729, + "grad_norm": 0.09102824330329895, "learning_rate": 0.0009912418821597289, - "loss": 1.5779, + "loss": 1.5789, "step": 6308 }, { "epoch": 0.28055666711128896, - "grad_norm": 0.09250202775001526, + "grad_norm": 0.09098983556032181, "learning_rate": 0.0009912353003319679, - "loss": 1.5794, + "loss": 1.5816, "step": 6310 }, { "epoch": 0.28064559156996133, - "grad_norm": 0.08937076479196548, + "grad_norm": 0.0941241905093193, "learning_rate": 0.0009912287160538436, - "loss": 1.5778, + "loss": 1.5775, "step": 6312 }, { "epoch": 0.2807345160286337, - "grad_norm": 0.09536420553922653, + "grad_norm": 0.09087178856134415, "learning_rate": 0.000991222129325389, - "loss": 1.5753, + "loss": 1.5723, "step": 6314 }, { "epoch": 0.280823440487306, - "grad_norm": 0.09278502315282822, + "grad_norm": 0.08986848592758179, "learning_rate": 0.0009912155401466372, - "loss": 1.5744, + "loss": 1.5755, "step": 6316 }, { "epoch": 0.2809123649459784, - "grad_norm": 0.0898163914680481, + "grad_norm": 0.0928303524851799, "learning_rate": 0.0009912089485176203, - "loss": 1.5764, + "loss": 1.5737, "step": 6318 }, { "epoch": 0.28100128940465074, - "grad_norm": 0.09026706218719482, + "grad_norm": 0.09010421484708786, "learning_rate": 0.000991202354438372, - "loss": 1.5763, + "loss": 1.5804, "step": 6320 }, { "epoch": 0.2810902138633231, - "grad_norm": 0.09494885057210922, + "grad_norm": 0.09154298156499863, "learning_rate": 0.0009911957579089245, - "loss": 1.577, + "loss": 1.5741, "step": 6322 }, { "epoch": 0.2811791383219955, - "grad_norm": 0.0927162617444992, + "grad_norm": 0.09349411725997925, "learning_rate": 0.0009911891589293113, - "loss": 1.575, + "loss": 1.571, "step": 6324 }, { "epoch": 0.28126806278066785, - "grad_norm": 0.09039205312728882, + "grad_norm": 0.08908021450042725, "learning_rate": 0.000991182557499565, - "loss": 1.5816, + "loss": 1.5772, "step": 6326 }, { "epoch": 0.28135698723934016, - "grad_norm": 0.0965559259057045, + "grad_norm": 0.09845002740621567, "learning_rate": 0.0009911759536197183, - "loss": 1.5758, + "loss": 1.5778, "step": 6328 }, { "epoch": 0.28144591169801253, - "grad_norm": 0.10055029392242432, + "grad_norm": 0.08850311487913132, "learning_rate": 0.0009911693472898046, - "loss": 1.5779, + "loss": 1.5749, "step": 6330 }, { "epoch": 0.2815348361566849, - "grad_norm": 0.09373793005943298, + "grad_norm": 0.09225910902023315, "learning_rate": 0.0009911627385098566, - "loss": 1.5715, + "loss": 1.577, "step": 6332 }, { "epoch": 0.28162376061535727, - "grad_norm": 0.08934628963470459, + "grad_norm": 0.09028570353984833, "learning_rate": 0.0009911561272799075, - "loss": 1.5766, + "loss": 1.5768, "step": 6334 }, { "epoch": 0.28171268507402963, - "grad_norm": 0.09276317059993744, + "grad_norm": 0.08955962210893631, "learning_rate": 0.00099114951359999, - "loss": 1.5804, + "loss": 1.5769, "step": 6336 }, { "epoch": 0.28180160953270195, - "grad_norm": 0.09163730591535568, + "grad_norm": 0.08538378030061722, "learning_rate": 0.0009911428974701372, - "loss": 1.5706, + "loss": 1.5768, "step": 6338 }, { "epoch": 0.2818905339913743, - "grad_norm": 0.09011133015155792, + "grad_norm": 0.08776732534170151, "learning_rate": 0.0009911362788903822, - "loss": 1.5778, + "loss": 1.5755, "step": 6340 }, { "epoch": 0.2819794584500467, - "grad_norm": 0.09156705439090729, + "grad_norm": 0.08690127730369568, "learning_rate": 0.0009911296578607577, - "loss": 1.5809, + "loss": 1.5728, "step": 6342 }, { "epoch": 0.28206838290871905, - "grad_norm": 0.0904756560921669, + "grad_norm": 0.08759032934904099, "learning_rate": 0.000991123034381297, - "loss": 1.573, + "loss": 1.5728, "step": 6344 }, { "epoch": 0.2821573073673914, - "grad_norm": 0.09244727343320847, + "grad_norm": 0.08740068972110748, "learning_rate": 0.0009911164084520333, - "loss": 1.5827, + "loss": 1.5709, "step": 6346 }, { "epoch": 0.28224623182606373, - "grad_norm": 0.0875893235206604, + "grad_norm": 0.09269807487726212, "learning_rate": 0.0009911097800729993, - "loss": 1.58, + "loss": 1.5711, "step": 6348 }, { "epoch": 0.2823351562847361, - "grad_norm": 0.08677124977111816, + "grad_norm": 0.08954404294490814, "learning_rate": 0.0009911031492442282, - "loss": 1.5769, + "loss": 1.5742, "step": 6350 }, { "epoch": 0.28242408074340847, - "grad_norm": 0.08941702544689178, + "grad_norm": 0.08641083538532257, "learning_rate": 0.000991096515965753, - "loss": 1.5781, + "loss": 1.5766, "step": 6352 }, { "epoch": 0.28251300520208084, - "grad_norm": 0.08653330057859421, + "grad_norm": 0.0864805057644844, "learning_rate": 0.0009910898802376069, - "loss": 1.5807, + "loss": 1.5738, "step": 6354 }, { "epoch": 0.2826019296607532, - "grad_norm": 0.08980996161699295, + "grad_norm": 0.09081622213125229, "learning_rate": 0.000991083242059823, - "loss": 1.5729, + "loss": 1.5746, "step": 6356 }, { "epoch": 0.28269085411942557, - "grad_norm": 0.08928785473108292, + "grad_norm": 0.09132978320121765, "learning_rate": 0.0009910766014324345, - "loss": 1.5773, + "loss": 1.5765, "step": 6358 }, { "epoch": 0.2827797785780979, - "grad_norm": 0.09071324020624161, + "grad_norm": 0.09193006902933121, "learning_rate": 0.000991069958355474, - "loss": 1.5783, + "loss": 1.5782, "step": 6360 }, { "epoch": 0.28286870303677025, - "grad_norm": 0.09113699197769165, + "grad_norm": 0.09301292151212692, "learning_rate": 0.0009910633128289753, - "loss": 1.5743, + "loss": 1.5711, "step": 6362 }, { "epoch": 0.2829576274954426, - "grad_norm": 0.09183058887720108, + "grad_norm": 0.09332393854856491, "learning_rate": 0.0009910566648529712, - "loss": 1.5802, + "loss": 1.5724, "step": 6364 }, { "epoch": 0.283046551954115, - "grad_norm": 0.08970838785171509, + "grad_norm": 0.08995355665683746, "learning_rate": 0.000991050014427495, - "loss": 1.5798, + "loss": 1.5861, "step": 6366 }, { "epoch": 0.28313547641278736, - "grad_norm": 0.08981199562549591, + "grad_norm": 0.09018755704164505, "learning_rate": 0.00099104336155258, - "loss": 1.5731, + "loss": 1.5695, "step": 6368 }, { "epoch": 0.28322440087145967, - "grad_norm": 0.08866599947214127, + "grad_norm": 0.09240277856588364, "learning_rate": 0.000991036706228259, - "loss": 1.5759, + "loss": 1.5718, "step": 6370 }, { "epoch": 0.28331332533013204, - "grad_norm": 0.0883878767490387, + "grad_norm": 0.08703695982694626, "learning_rate": 0.0009910300484545654, - "loss": 1.5755, + "loss": 1.5753, "step": 6372 }, { "epoch": 0.2834022497888044, - "grad_norm": 0.09433773905038834, + "grad_norm": 0.09340234845876694, "learning_rate": 0.0009910233882315326, - "loss": 1.5788, + "loss": 1.578, "step": 6374 }, { "epoch": 0.2834911742474768, - "grad_norm": 0.09115243703126907, + "grad_norm": 0.09236439317464828, "learning_rate": 0.0009910167255591934, - "loss": 1.5792, + "loss": 1.5762, "step": 6376 }, { "epoch": 0.28358009870614914, - "grad_norm": 0.08943197876214981, + "grad_norm": 0.09320174157619476, "learning_rate": 0.0009910100604375812, - "loss": 1.5767, + "loss": 1.569, "step": 6378 }, { "epoch": 0.2836690231648215, - "grad_norm": 0.08791480213403702, + "grad_norm": 0.08888036757707596, "learning_rate": 0.0009910033928667294, - "loss": 1.5751, + "loss": 1.5729, "step": 6380 }, { "epoch": 0.2837579476234938, - "grad_norm": 0.08960872143507004, + "grad_norm": 0.0888710469007492, "learning_rate": 0.0009909967228466713, - "loss": 1.5751, + "loss": 1.5731, "step": 6382 }, { "epoch": 0.2838468720821662, - "grad_norm": 0.09118589013814926, + "grad_norm": 0.09013516455888748, "learning_rate": 0.00099099005037744, - "loss": 1.5812, + "loss": 1.5704, "step": 6384 }, { "epoch": 0.28393579654083856, - "grad_norm": 0.09271421283483505, + "grad_norm": 0.08720479905605316, "learning_rate": 0.0009909833754590688, - "loss": 1.5718, + "loss": 1.5738, "step": 6386 }, { "epoch": 0.2840247209995109, - "grad_norm": 0.08939196914434433, + "grad_norm": 0.08885452896356583, "learning_rate": 0.000990976698091591, - "loss": 1.5772, + "loss": 1.5726, "step": 6388 }, { "epoch": 0.2841136454581833, - "grad_norm": 0.0881928876042366, + "grad_norm": 0.09276578575372696, "learning_rate": 0.00099097001827504, - "loss": 1.5754, + "loss": 1.5756, "step": 6390 }, { "epoch": 0.2842025699168556, - "grad_norm": 0.09285976737737656, + "grad_norm": 0.08871392160654068, "learning_rate": 0.000990963336009449, - "loss": 1.5825, + "loss": 1.5747, "step": 6392 }, { "epoch": 0.284291494375528, - "grad_norm": 0.08723361045122147, + "grad_norm": 0.09305480867624283, "learning_rate": 0.0009909566512948513, - "loss": 1.5777, + "loss": 1.5691, "step": 6394 }, { "epoch": 0.28438041883420034, - "grad_norm": 0.09038031846284866, + "grad_norm": 0.08993328362703323, "learning_rate": 0.0009909499641312805, - "loss": 1.5756, + "loss": 1.5752, "step": 6396 }, { "epoch": 0.2844693432928727, - "grad_norm": 0.09389235079288483, + "grad_norm": 0.09120582789182663, "learning_rate": 0.0009909432745187697, - "loss": 1.5766, + "loss": 1.5774, "step": 6398 }, { "epoch": 0.2845582677515451, - "grad_norm": 0.08978234231472015, + "grad_norm": 0.08618117868900299, "learning_rate": 0.0009909365824573523, - "loss": 1.5761, + "loss": 1.575, "step": 6400 }, { "epoch": 0.28464719221021745, - "grad_norm": 0.08565577864646912, + "grad_norm": 0.08837292343378067, "learning_rate": 0.0009909298879470618, - "loss": 1.5758, + "loss": 1.5787, "step": 6402 }, { "epoch": 0.28473611666888976, - "grad_norm": 0.0905274748802185, + "grad_norm": 0.08570041507482529, "learning_rate": 0.0009909231909879315, - "loss": 1.577, + "loss": 1.5808, "step": 6404 }, { "epoch": 0.28482504112756213, - "grad_norm": 0.08732505887746811, + "grad_norm": 0.09008973836898804, "learning_rate": 0.000990916491579995, - "loss": 1.572, + "loss": 1.5763, "step": 6406 }, { "epoch": 0.2849139655862345, - "grad_norm": 0.08888770639896393, + "grad_norm": 0.09252497553825378, "learning_rate": 0.0009909097897232854, - "loss": 1.574, + "loss": 1.5776, "step": 6408 }, { "epoch": 0.28500289004490686, - "grad_norm": 0.0869973823428154, + "grad_norm": 0.08563211560249329, "learning_rate": 0.0009909030854178362, - "loss": 1.576, + "loss": 1.5674, "step": 6410 }, { "epoch": 0.28509181450357923, - "grad_norm": 0.0890926942229271, + "grad_norm": 0.09084684401750565, "learning_rate": 0.0009908963786636811, - "loss": 1.5796, + "loss": 1.5732, "step": 6412 }, { "epoch": 0.28518073896225155, - "grad_norm": 0.09170569479465485, + "grad_norm": 0.09400637447834015, "learning_rate": 0.0009908896694608533, - "loss": 1.5787, + "loss": 1.5712, "step": 6414 }, { "epoch": 0.2852696634209239, - "grad_norm": 0.08640869706869125, + "grad_norm": 0.08952626585960388, "learning_rate": 0.0009908829578093863, - "loss": 1.5736, + "loss": 1.5744, "step": 6416 }, { "epoch": 0.2853585878795963, - "grad_norm": 0.0850198045372963, + "grad_norm": 0.09080950170755386, "learning_rate": 0.0009908762437093137, - "loss": 1.5701, + "loss": 1.57, "step": 6418 }, { "epoch": 0.28544751233826865, - "grad_norm": 0.08946503698825836, + "grad_norm": 0.08886551856994629, "learning_rate": 0.000990869527160669, - "loss": 1.5734, + "loss": 1.5763, "step": 6420 }, { "epoch": 0.285536436796941, - "grad_norm": 0.0856601893901825, + "grad_norm": 0.09163545817136765, "learning_rate": 0.0009908628081634855, - "loss": 1.5725, + "loss": 1.577, "step": 6422 }, { "epoch": 0.28562536125561333, - "grad_norm": 0.08981139957904816, + "grad_norm": 0.08795285224914551, "learning_rate": 0.0009908560867177966, - "loss": 1.5771, + "loss": 1.5714, "step": 6424 }, { "epoch": 0.2857142857142857, - "grad_norm": 0.09108799695968628, + "grad_norm": 0.08879886567592621, "learning_rate": 0.0009908493628236364, - "loss": 1.573, + "loss": 1.5738, "step": 6426 }, { "epoch": 0.28580321017295807, - "grad_norm": 0.088833287358284, + "grad_norm": 0.09103137999773026, "learning_rate": 0.000990842636481038, - "loss": 1.5747, + "loss": 1.5725, "step": 6428 }, { "epoch": 0.28589213463163043, - "grad_norm": 0.0866592600941658, + "grad_norm": 0.08968726545572281, "learning_rate": 0.0009908359076900351, - "loss": 1.576, + "loss": 1.5728, "step": 6430 }, { "epoch": 0.2859810590903028, - "grad_norm": 0.09101054817438126, + "grad_norm": 0.08867494016885757, "learning_rate": 0.000990829176450661, - "loss": 1.5765, + "loss": 1.5695, "step": 6432 }, { "epoch": 0.28606998354897517, - "grad_norm": 0.08800952136516571, + "grad_norm": 0.08909203112125397, "learning_rate": 0.0009908224427629497, - "loss": 1.5737, + "loss": 1.5785, "step": 6434 }, { "epoch": 0.2861589080076475, - "grad_norm": 0.08939360082149506, + "grad_norm": 0.08877209573984146, "learning_rate": 0.0009908157066269346, - "loss": 1.5797, + "loss": 1.5675, "step": 6436 }, { "epoch": 0.28624783246631985, - "grad_norm": 0.08347197622060776, + "grad_norm": 0.08952686190605164, "learning_rate": 0.0009908089680426492, - "loss": 1.5755, + "loss": 1.5712, "step": 6438 }, { "epoch": 0.2863367569249922, - "grad_norm": 0.08396611362695694, + "grad_norm": 0.08946753293275833, "learning_rate": 0.0009908022270101273, - "loss": 1.5704, + "loss": 1.5706, "step": 6440 }, { "epoch": 0.2864256813836646, - "grad_norm": 0.08842881768941879, + "grad_norm": 0.08814926445484161, "learning_rate": 0.0009907954835294023, - "loss": 1.5711, + "loss": 1.5738, "step": 6442 }, { "epoch": 0.28651460584233696, - "grad_norm": 0.08789164572954178, + "grad_norm": 0.09422788769006729, "learning_rate": 0.000990788737600508, - "loss": 1.5753, + "loss": 1.5799, "step": 6444 }, { "epoch": 0.28660353030100927, - "grad_norm": 0.0914948359131813, + "grad_norm": 0.09289488941431046, "learning_rate": 0.0009907819892234781, - "loss": 1.5776, + "loss": 1.5761, "step": 6446 }, { "epoch": 0.28669245475968164, - "grad_norm": 0.09023447334766388, + "grad_norm": 0.08914397656917572, "learning_rate": 0.0009907752383983461, - "loss": 1.5746, + "loss": 1.569, "step": 6448 }, { "epoch": 0.286781379218354, - "grad_norm": 0.08644402027130127, + "grad_norm": 0.08954908698797226, "learning_rate": 0.0009907684851251457, - "loss": 1.5828, + "loss": 1.5802, "step": 6450 }, { "epoch": 0.2868703036770264, - "grad_norm": 0.09444896876811981, + "grad_norm": 0.09144933521747589, "learning_rate": 0.0009907617294039107, - "loss": 1.5749, + "loss": 1.5758, "step": 6452 }, { "epoch": 0.28695922813569874, - "grad_norm": 0.09218712151050568, + "grad_norm": 0.09079254418611526, "learning_rate": 0.0009907549712346747, - "loss": 1.5747, + "loss": 1.5721, "step": 6454 }, { "epoch": 0.2870481525943711, - "grad_norm": 0.09216786921024323, + "grad_norm": 0.08957768976688385, "learning_rate": 0.0009907482106174714, - "loss": 1.5686, + "loss": 1.5687, "step": 6456 }, { "epoch": 0.2871370770530434, - "grad_norm": 0.09627875685691833, + "grad_norm": 0.09010159969329834, "learning_rate": 0.0009907414475523347, - "loss": 1.5795, + "loss": 1.5772, "step": 6458 }, { "epoch": 0.2872260015117158, - "grad_norm": 0.0945919081568718, + "grad_norm": 0.09015689790248871, "learning_rate": 0.000990734682039298, - "loss": 1.5775, + "loss": 1.57, "step": 6460 }, { "epoch": 0.28731492597038816, - "grad_norm": 0.09069854766130447, + "grad_norm": 0.0964498221874237, "learning_rate": 0.0009907279140783953, - "loss": 1.5717, + "loss": 1.5716, "step": 6462 }, { "epoch": 0.2874038504290605, - "grad_norm": 0.0864575058221817, + "grad_norm": 0.09194839745759964, "learning_rate": 0.0009907211436696603, - "loss": 1.5729, + "loss": 1.5666, "step": 6464 }, { "epoch": 0.2874927748877329, - "grad_norm": 0.08474697917699814, + "grad_norm": 0.0931999608874321, "learning_rate": 0.000990714370813127, - "loss": 1.5797, + "loss": 1.5766, "step": 6466 }, { "epoch": 0.2875816993464052, - "grad_norm": 0.09655170887708664, + "grad_norm": 0.08518645912408829, "learning_rate": 0.0009907075955088286, - "loss": 1.5755, + "loss": 1.5778, "step": 6468 }, { "epoch": 0.2876706238050776, - "grad_norm": 0.0950208306312561, + "grad_norm": 0.08883948624134064, "learning_rate": 0.0009907008177567997, - "loss": 1.5822, + "loss": 1.5709, "step": 6470 }, { "epoch": 0.28775954826374994, - "grad_norm": 0.09422527253627777, + "grad_norm": 0.08934309333562851, "learning_rate": 0.0009906940375570734, - "loss": 1.572, + "loss": 1.5703, "step": 6472 }, { "epoch": 0.2878484727224223, - "grad_norm": 0.09860546141862869, + "grad_norm": 0.09022992104291916, "learning_rate": 0.0009906872549096837, - "loss": 1.5746, + "loss": 1.572, "step": 6474 }, { "epoch": 0.2879373971810947, - "grad_norm": 0.09196434170007706, + "grad_norm": 0.09257543832063675, "learning_rate": 0.0009906804698146647, - "loss": 1.5766, + "loss": 1.5726, "step": 6476 }, { "epoch": 0.288026321639767, - "grad_norm": 0.09196584671735764, + "grad_norm": 0.09016820788383484, "learning_rate": 0.0009906736822720501, - "loss": 1.5778, + "loss": 1.5735, "step": 6478 }, { "epoch": 0.28811524609843936, - "grad_norm": 0.09416534006595612, + "grad_norm": 0.08995562046766281, "learning_rate": 0.0009906668922818735, - "loss": 1.5727, + "loss": 1.5661, "step": 6480 }, { "epoch": 0.2882041705571117, - "grad_norm": 0.08899632841348648, + "grad_norm": 0.09062064439058304, "learning_rate": 0.0009906600998441694, - "loss": 1.5762, + "loss": 1.5656, "step": 6482 }, { "epoch": 0.2882930950157841, - "grad_norm": 0.09077105671167374, + "grad_norm": 0.08658911287784576, "learning_rate": 0.0009906533049589708, - "loss": 1.5743, + "loss": 1.5682, "step": 6484 }, { "epoch": 0.28838201947445646, - "grad_norm": 0.08790810406208038, + "grad_norm": 0.08712375164031982, "learning_rate": 0.0009906465076263123, - "loss": 1.5757, + "loss": 1.5734, "step": 6486 }, { "epoch": 0.28847094393312883, - "grad_norm": 0.08573640882968903, + "grad_norm": 0.0906602293252945, "learning_rate": 0.0009906397078462276, - "loss": 1.5705, + "loss": 1.5742, "step": 6488 }, { "epoch": 0.28855986839180114, - "grad_norm": 0.08646200597286224, + "grad_norm": 0.08672540634870529, "learning_rate": 0.0009906329056187505, - "loss": 1.5771, + "loss": 1.5789, "step": 6490 }, { "epoch": 0.2886487928504735, - "grad_norm": 0.08822999894618988, + "grad_norm": 0.08817194402217865, "learning_rate": 0.0009906261009439152, - "loss": 1.5845, + "loss": 1.5728, "step": 6492 }, { "epoch": 0.2887377173091459, - "grad_norm": 0.08835070580244064, + "grad_norm": 0.09028277546167374, "learning_rate": 0.0009906192938217552, - "loss": 1.5755, + "loss": 1.5735, "step": 6494 }, { "epoch": 0.28882664176781825, - "grad_norm": 0.08538537472486496, + "grad_norm": 0.09235137701034546, "learning_rate": 0.000990612484252305, - "loss": 1.5786, + "loss": 1.5749, "step": 6496 }, { "epoch": 0.2889155662264906, - "grad_norm": 0.08954775333404541, + "grad_norm": 0.09245535731315613, "learning_rate": 0.000990605672235598, - "loss": 1.5816, + "loss": 1.5751, "step": 6498 }, { "epoch": 0.28900449068516293, - "grad_norm": 0.08904875814914703, + "grad_norm": 0.08902133256196976, "learning_rate": 0.0009905988577716687, - "loss": 1.579, + "loss": 1.5744, "step": 6500 }, { "epoch": 0.28900449068516293, - "eval_loss": 1.5529825687408447, - "eval_runtime": 12.3674, - "eval_samples_per_second": 558.726, - "eval_steps_per_second": 69.861, + "eval_loss": 1.5468125343322754, + "eval_runtime": 12.4163, + "eval_samples_per_second": 556.526, + "eval_steps_per_second": 69.586, "step": 6500 }, { "epoch": 0.2890934151438353, - "grad_norm": 0.08835111558437347, + "grad_norm": 0.09113017469644547, "learning_rate": 0.0009905920408605508, - "loss": 1.5768, + "loss": 1.5807, "step": 6502 }, { "epoch": 0.28918233960250767, - "grad_norm": 0.08942531794309616, + "grad_norm": 0.09066907316446304, "learning_rate": 0.0009905852215022783, - "loss": 1.5733, + "loss": 1.5756, "step": 6504 }, { "epoch": 0.28927126406118003, - "grad_norm": 0.08983755856752396, + "grad_norm": 0.09358642995357513, "learning_rate": 0.0009905783996968854, - "loss": 1.5754, + "loss": 1.5779, "step": 6506 }, { "epoch": 0.2893601885198524, - "grad_norm": 0.08812680840492249, + "grad_norm": 0.08834748715162277, "learning_rate": 0.000990571575444406, - "loss": 1.5752, + "loss": 1.5675, "step": 6508 }, { "epoch": 0.28944911297852477, - "grad_norm": 0.08993680775165558, + "grad_norm": 0.08833714574575424, "learning_rate": 0.000990564748744874, - "loss": 1.5683, + "loss": 1.5699, "step": 6510 }, { "epoch": 0.2895380374371971, - "grad_norm": 0.08852921426296234, + "grad_norm": 0.09438148140907288, "learning_rate": 0.000990557919598324, - "loss": 1.5779, + "loss": 1.5703, "step": 6512 }, { "epoch": 0.28962696189586945, - "grad_norm": 0.0910344123840332, + "grad_norm": 0.08776549994945526, "learning_rate": 0.000990551088004789, - "loss": 1.5702, + "loss": 1.5659, "step": 6514 }, { "epoch": 0.2897158863545418, - "grad_norm": 0.09157989919185638, + "grad_norm": 0.09066025912761688, "learning_rate": 0.0009905442539643043, - "loss": 1.5793, + "loss": 1.5698, "step": 6516 }, { "epoch": 0.2898048108132142, - "grad_norm": 0.08516612648963928, + "grad_norm": 0.08665291219949722, "learning_rate": 0.0009905374174769032, - "loss": 1.5806, + "loss": 1.5707, "step": 6518 }, { "epoch": 0.28989373527188655, - "grad_norm": 0.08848042041063309, + "grad_norm": 0.08850061148405075, "learning_rate": 0.0009905305785426202, - "loss": 1.5723, + "loss": 1.5764, "step": 6520 }, { "epoch": 0.28998265973055887, - "grad_norm": 0.09307534992694855, + "grad_norm": 0.09048718959093094, "learning_rate": 0.000990523737161489, - "loss": 1.5761, + "loss": 1.5785, "step": 6522 }, { "epoch": 0.29007158418923124, - "grad_norm": 0.09274153411388397, + "grad_norm": 0.08610177785158157, "learning_rate": 0.0009905168933335442, - "loss": 1.5753, + "loss": 1.5681, "step": 6524 }, { "epoch": 0.2901605086479036, - "grad_norm": 0.0851801261305809, + "grad_norm": 0.08931626379489899, "learning_rate": 0.0009905100470588197, - "loss": 1.5718, + "loss": 1.5733, "step": 6526 }, { "epoch": 0.29024943310657597, - "grad_norm": 0.08987370133399963, + "grad_norm": 0.08849899470806122, "learning_rate": 0.0009905031983373496, - "loss": 1.5804, + "loss": 1.5729, "step": 6528 }, { "epoch": 0.29033835756524834, - "grad_norm": 0.09645106643438339, + "grad_norm": 0.08723746240139008, "learning_rate": 0.0009904963471691682, - "loss": 1.5734, + "loss": 1.5738, "step": 6530 }, { "epoch": 0.29042728202392065, - "grad_norm": 0.09304139763116837, + "grad_norm": 0.08764983713626862, "learning_rate": 0.0009904894935543094, - "loss": 1.5705, + "loss": 1.5785, "step": 6532 }, { "epoch": 0.290516206482593, - "grad_norm": 0.09060996025800705, + "grad_norm": 0.09550359100103378, "learning_rate": 0.0009904826374928077, - "loss": 1.5647, + "loss": 1.5718, "step": 6534 }, { "epoch": 0.2906051309412654, - "grad_norm": 0.08674405515193939, + "grad_norm": 0.09002628922462463, "learning_rate": 0.0009904757789846973, - "loss": 1.5705, + "loss": 1.5796, "step": 6536 }, { "epoch": 0.29069405539993776, - "grad_norm": 0.0911547839641571, + "grad_norm": 0.09034377336502075, "learning_rate": 0.0009904689180300122, - "loss": 1.5728, + "loss": 1.5693, "step": 6538 }, { "epoch": 0.2907829798586101, - "grad_norm": 0.09550792723894119, + "grad_norm": 0.09026948362588882, "learning_rate": 0.0009904620546287866, - "loss": 1.5784, + "loss": 1.578, "step": 6540 }, { "epoch": 0.2908719043172825, - "grad_norm": 0.09049154818058014, + "grad_norm": 0.09430602192878723, "learning_rate": 0.0009904551887810551, - "loss": 1.5695, + "loss": 1.574, "step": 6542 }, { "epoch": 0.2909608287759548, - "grad_norm": 0.08971771597862244, + "grad_norm": 0.08742611855268478, "learning_rate": 0.0009904483204868516, - "loss": 1.5712, + "loss": 1.5757, "step": 6544 }, { "epoch": 0.2910497532346272, - "grad_norm": 0.09300079941749573, + "grad_norm": 0.08668093383312225, "learning_rate": 0.0009904414497462104, - "loss": 1.5732, + "loss": 1.5713, "step": 6546 }, { "epoch": 0.29113867769329954, - "grad_norm": 0.0910913348197937, + "grad_norm": 0.0894765630364418, "learning_rate": 0.000990434576559166, - "loss": 1.5708, + "loss": 1.5737, "step": 6548 }, { "epoch": 0.2912276021519719, - "grad_norm": 0.09235113114118576, + "grad_norm": 0.09396469593048096, "learning_rate": 0.0009904277009257524, - "loss": 1.5734, + "loss": 1.5765, "step": 6550 }, { "epoch": 0.2913165266106443, - "grad_norm": 0.08843281120061874, + "grad_norm": 0.08750537782907486, "learning_rate": 0.0009904208228460041, - "loss": 1.5728, + "loss": 1.5662, "step": 6552 }, { "epoch": 0.2914054510693166, - "grad_norm": 0.09289004653692245, + "grad_norm": 0.08443306386470795, "learning_rate": 0.0009904139423199552, - "loss": 1.5732, + "loss": 1.5776, "step": 6554 }, { "epoch": 0.29149437552798896, - "grad_norm": 0.09066592901945114, + "grad_norm": 0.08997094631195068, "learning_rate": 0.00099040705934764, - "loss": 1.5765, + "loss": 1.5678, "step": 6556 }, { "epoch": 0.2915832999866613, - "grad_norm": 0.0933317095041275, + "grad_norm": 0.08915294706821442, "learning_rate": 0.0009904001739290934, - "loss": 1.5769, + "loss": 1.5724, "step": 6558 }, { "epoch": 0.2916722244453337, - "grad_norm": 0.08912403881549835, + "grad_norm": 0.08931171149015427, "learning_rate": 0.0009903932860643493, - "loss": 1.5727, + "loss": 1.5644, "step": 6560 }, { "epoch": 0.29176114890400606, - "grad_norm": 0.09174713492393494, + "grad_norm": 0.09019617736339569, "learning_rate": 0.0009903863957534417, - "loss": 1.5759, + "loss": 1.5755, "step": 6562 }, { "epoch": 0.29185007336267843, - "grad_norm": 0.09204263240098953, + "grad_norm": 0.08960139006376266, "learning_rate": 0.0009903795029964058, - "loss": 1.5651, + "loss": 1.5696, "step": 6564 }, { "epoch": 0.29193899782135074, - "grad_norm": 0.08766184002161026, + "grad_norm": 0.08686326444149017, "learning_rate": 0.0009903726077932753, - "loss": 1.5673, + "loss": 1.5728, "step": 6566 }, { "epoch": 0.2920279222800231, - "grad_norm": 0.08856314420700073, + "grad_norm": 0.08807472884654999, "learning_rate": 0.0009903657101440848, - "loss": 1.5653, + "loss": 1.5748, "step": 6568 }, { "epoch": 0.2921168467386955, - "grad_norm": 0.09339053183794022, + "grad_norm": 0.0936129167675972, "learning_rate": 0.0009903588100488687, - "loss": 1.572, + "loss": 1.5737, "step": 6570 }, { "epoch": 0.29220577119736785, - "grad_norm": 0.08935055881738663, + "grad_norm": 0.09177185595035553, "learning_rate": 0.0009903519075076617, - "loss": 1.5685, + "loss": 1.5696, "step": 6572 }, { "epoch": 0.2922946956560402, - "grad_norm": 0.09450680762529373, + "grad_norm": 0.08630893379449844, "learning_rate": 0.0009903450025204978, - "loss": 1.5738, + "loss": 1.5701, "step": 6574 }, { "epoch": 0.29238362011471253, - "grad_norm": 0.0896066278219223, + "grad_norm": 0.08818253129720688, "learning_rate": 0.0009903380950874116, - "loss": 1.5808, + "loss": 1.5675, "step": 6576 }, { "epoch": 0.2924725445733849, - "grad_norm": 0.09094058722257614, + "grad_norm": 0.08376295864582062, "learning_rate": 0.0009903311852084379, - "loss": 1.5769, + "loss": 1.5677, "step": 6578 }, { "epoch": 0.29256146903205726, - "grad_norm": 0.0920943170785904, + "grad_norm": 0.08859193325042725, "learning_rate": 0.0009903242728836106, - "loss": 1.5711, + "loss": 1.5702, "step": 6580 }, { "epoch": 0.29265039349072963, - "grad_norm": 0.08988334238529205, + "grad_norm": 0.0904025286436081, "learning_rate": 0.0009903173581129645, - "loss": 1.5768, + "loss": 1.5719, "step": 6582 }, { "epoch": 0.292739317949402, - "grad_norm": 0.08709396421909332, + "grad_norm": 0.08744067698717117, "learning_rate": 0.0009903104408965338, - "loss": 1.5719, + "loss": 1.57, "step": 6584 }, { "epoch": 0.29282824240807437, - "grad_norm": 0.08830174803733826, + "grad_norm": 0.08911757916212082, "learning_rate": 0.0009903035212343535, - "loss": 1.571, + "loss": 1.5734, "step": 6586 }, { "epoch": 0.2929171668667467, - "grad_norm": 0.09119716286659241, + "grad_norm": 0.0855589210987091, "learning_rate": 0.000990296599126458, - "loss": 1.58, + "loss": 1.5706, "step": 6588 }, { "epoch": 0.29300609132541905, - "grad_norm": 0.08716332912445068, + "grad_norm": 0.0880928561091423, "learning_rate": 0.0009902896745728814, - "loss": 1.5741, + "loss": 1.5733, "step": 6590 }, { "epoch": 0.2930950157840914, - "grad_norm": 0.08740916103124619, + "grad_norm": 0.08391828835010529, "learning_rate": 0.0009902827475736585, - "loss": 1.5765, + "loss": 1.5713, "step": 6592 }, { "epoch": 0.2931839402427638, - "grad_norm": 0.09080121666193008, + "grad_norm": 0.08916763961315155, "learning_rate": 0.000990275818128824, - "loss": 1.567, + "loss": 1.575, "step": 6594 }, { "epoch": 0.29327286470143615, - "grad_norm": 0.0871514305472374, + "grad_norm": 0.08984120190143585, "learning_rate": 0.0009902688862384122, - "loss": 1.5767, + "loss": 1.5738, "step": 6596 }, { "epoch": 0.29336178916010847, - "grad_norm": 0.08797577768564224, + "grad_norm": 0.08982984721660614, "learning_rate": 0.000990261951902458, - "loss": 1.5733, + "loss": 1.5709, "step": 6598 }, { "epoch": 0.29345071361878083, - "grad_norm": 0.09561526775360107, + "grad_norm": 0.08733770996332169, "learning_rate": 0.0009902550151209956, - "loss": 1.5747, + "loss": 1.5711, "step": 6600 }, { "epoch": 0.2935396380774532, - "grad_norm": 0.0883723720908165, + "grad_norm": 0.08438307046890259, "learning_rate": 0.00099024807589406, - "loss": 1.5737, + "loss": 1.5683, "step": 6602 }, { "epoch": 0.29362856253612557, - "grad_norm": 0.08966512233018875, + "grad_norm": 0.08910389244556427, "learning_rate": 0.0009902411342216854, - "loss": 1.5774, + "loss": 1.5732, "step": 6604 }, { "epoch": 0.29371748699479794, - "grad_norm": 0.09019225090742111, + "grad_norm": 0.09192346036434174, "learning_rate": 0.000990234190103907, - "loss": 1.5751, + "loss": 1.5696, "step": 6606 }, { "epoch": 0.29380641145347025, - "grad_norm": 0.08985605090856552, + "grad_norm": 0.0915895402431488, "learning_rate": 0.0009902272435407587, - "loss": 1.5752, + "loss": 1.5671, "step": 6608 }, { "epoch": 0.2938953359121426, - "grad_norm": 0.08699138462543488, + "grad_norm": 0.09286682307720184, "learning_rate": 0.0009902202945322757, - "loss": 1.5762, + "loss": 1.5757, "step": 6610 }, { "epoch": 0.293984260370815, - "grad_norm": 0.0910281166434288, + "grad_norm": 0.0949014201760292, "learning_rate": 0.0009902133430784925, - "loss": 1.5751, + "loss": 1.5736, "step": 6612 }, { "epoch": 0.29407318482948736, - "grad_norm": 0.09535197913646698, + "grad_norm": 0.09041435271501541, "learning_rate": 0.0009902063891794439, - "loss": 1.5746, + "loss": 1.5739, "step": 6614 }, { "epoch": 0.2941621092881597, - "grad_norm": 0.08794602751731873, + "grad_norm": 0.09175708889961243, "learning_rate": 0.0009901994328351642, - "loss": 1.5731, + "loss": 1.5704, "step": 6616 }, { "epoch": 0.2942510337468321, - "grad_norm": 0.09471576660871506, + "grad_norm": 0.09223156422376633, "learning_rate": 0.0009901924740456885, - "loss": 1.5743, + "loss": 1.5742, "step": 6618 }, { "epoch": 0.2943399582055044, - "grad_norm": 0.09367300570011139, + "grad_norm": 0.08808215707540512, "learning_rate": 0.0009901855128110514, - "loss": 1.5724, + "loss": 1.5751, "step": 6620 }, { "epoch": 0.29442888266417677, - "grad_norm": 0.08670418709516525, + "grad_norm": 0.08614702522754669, "learning_rate": 0.0009901785491312875, - "loss": 1.5725, + "loss": 1.5766, "step": 6622 }, { "epoch": 0.29451780712284914, - "grad_norm": 0.0896240845322609, + "grad_norm": 0.08723152428865433, "learning_rate": 0.0009901715830064317, - "loss": 1.5773, + "loss": 1.5684, "step": 6624 }, { "epoch": 0.2946067315815215, - "grad_norm": 0.08773031085729599, + "grad_norm": 0.08894877880811691, "learning_rate": 0.0009901646144365188, - "loss": 1.5765, + "loss": 1.5734, "step": 6626 }, { "epoch": 0.2946956560401939, - "grad_norm": 0.08660749346017838, + "grad_norm": 0.09144090861082077, "learning_rate": 0.0009901576434215833, - "loss": 1.5755, + "loss": 1.574, "step": 6628 }, { "epoch": 0.2947845804988662, - "grad_norm": 0.0855196937918663, + "grad_norm": 0.08594503253698349, "learning_rate": 0.0009901506699616601, - "loss": 1.5714, + "loss": 1.5716, "step": 6630 }, { "epoch": 0.29487350495753856, - "grad_norm": 0.08432745188474655, + "grad_norm": 0.09015947580337524, "learning_rate": 0.000990143694056784, - "loss": 1.5712, + "loss": 1.5693, "step": 6632 }, { "epoch": 0.2949624294162109, - "grad_norm": 0.08522246778011322, + "grad_norm": 0.08714131265878677, "learning_rate": 0.0009901367157069897, - "loss": 1.5727, + "loss": 1.5629, "step": 6634 }, { "epoch": 0.2950513538748833, - "grad_norm": 0.08673780411481857, + "grad_norm": 0.08628025650978088, "learning_rate": 0.0009901297349123124, - "loss": 1.5723, + "loss": 1.5687, "step": 6636 }, { "epoch": 0.29514027833355566, - "grad_norm": 0.08892252296209335, + "grad_norm": 0.09013169258832932, "learning_rate": 0.0009901227516727865, - "loss": 1.5795, + "loss": 1.5728, "step": 6638 }, { "epoch": 0.29522920279222803, - "grad_norm": 0.0852012112736702, + "grad_norm": 0.09108193218708038, "learning_rate": 0.0009901157659884467, - "loss": 1.5787, + "loss": 1.5688, "step": 6640 }, { "epoch": 0.29531812725090034, - "grad_norm": 0.08897509425878525, + "grad_norm": 0.08838926255702972, "learning_rate": 0.0009901087778593286, - "loss": 1.5768, + "loss": 1.5676, "step": 6642 }, { "epoch": 0.2954070517095727, - "grad_norm": 0.08539470285177231, + "grad_norm": 0.085484080016613, "learning_rate": 0.0009901017872854662, - "loss": 1.5712, + "loss": 1.5715, "step": 6644 }, { "epoch": 0.2954959761682451, - "grad_norm": 0.08583150058984756, + "grad_norm": 0.08966375142335892, "learning_rate": 0.0009900947942668948, - "loss": 1.5743, + "loss": 1.5704, "step": 6646 }, { "epoch": 0.29558490062691745, - "grad_norm": 0.0858711525797844, + "grad_norm": 0.08792408555746078, "learning_rate": 0.0009900877988036493, - "loss": 1.5769, + "loss": 1.5707, "step": 6648 }, { "epoch": 0.2956738250855898, - "grad_norm": 0.08887479454278946, + "grad_norm": 0.08903691917657852, "learning_rate": 0.0009900808008957642, - "loss": 1.569, + "loss": 1.5769, "step": 6650 }, { "epoch": 0.2957627495442621, - "grad_norm": 0.09324241429567337, + "grad_norm": 0.09390727430582047, "learning_rate": 0.000990073800543275, - "loss": 1.5743, + "loss": 1.5715, "step": 6652 }, { "epoch": 0.2958516740029345, - "grad_norm": 0.08654385060071945, + "grad_norm": 0.09213802218437195, "learning_rate": 0.0009900667977462163, - "loss": 1.5671, + "loss": 1.5754, "step": 6654 }, { "epoch": 0.29594059846160686, - "grad_norm": 0.08895711600780487, + "grad_norm": 0.0865757092833519, "learning_rate": 0.000990059792504623, - "loss": 1.5747, + "loss": 1.5704, "step": 6656 }, { "epoch": 0.29602952292027923, - "grad_norm": 0.09034650772809982, + "grad_norm": 0.08747228235006332, "learning_rate": 0.0009900527848185302, - "loss": 1.5768, + "loss": 1.571, "step": 6658 }, { "epoch": 0.2961184473789516, - "grad_norm": 0.08898959308862686, + "grad_norm": 0.08495338261127472, "learning_rate": 0.0009900457746879728, - "loss": 1.5739, + "loss": 1.5669, "step": 6660 }, { "epoch": 0.2962073718376239, - "grad_norm": 0.08816082775592804, + "grad_norm": 0.0894813984632492, "learning_rate": 0.0009900387621129855, - "loss": 1.5688, + "loss": 1.5695, "step": 6662 }, { "epoch": 0.2962962962962963, - "grad_norm": 0.09308087825775146, + "grad_norm": 0.08558983355760574, "learning_rate": 0.0009900317470936037, - "loss": 1.5768, + "loss": 1.5744, "step": 6664 }, { "epoch": 0.29638522075496865, - "grad_norm": 0.10000612586736679, + "grad_norm": 0.08508554100990295, "learning_rate": 0.0009900247296298621, - "loss": 1.574, + "loss": 1.5674, "step": 6666 }, { "epoch": 0.296474145213641, - "grad_norm": 0.09098842740058899, + "grad_norm": 0.08840614557266235, "learning_rate": 0.000990017709721796, - "loss": 1.5756, + "loss": 1.569, "step": 6668 }, { "epoch": 0.2965630696723134, - "grad_norm": 0.0916735976934433, + "grad_norm": 0.08688345551490784, "learning_rate": 0.0009900106873694402, - "loss": 1.5709, + "loss": 1.5724, "step": 6670 }, { "epoch": 0.29665199413098575, - "grad_norm": 0.08966685831546783, + "grad_norm": 0.0900869369506836, "learning_rate": 0.0009900036625728294, - "loss": 1.5754, + "loss": 1.5707, "step": 6672 }, { "epoch": 0.29674091858965806, - "grad_norm": 0.09121707081794739, + "grad_norm": 0.08745554089546204, "learning_rate": 0.0009899966353319994, - "loss": 1.5724, + "loss": 1.5689, "step": 6674 }, { "epoch": 0.29682984304833043, - "grad_norm": 0.08943553268909454, + "grad_norm": 0.08753221482038498, "learning_rate": 0.0009899896056469845, - "loss": 1.5718, + "loss": 1.5709, "step": 6676 }, { "epoch": 0.2969187675070028, - "grad_norm": 0.09027671813964844, + "grad_norm": 0.09187465906143188, "learning_rate": 0.0009899825735178204, - "loss": 1.5659, + "loss": 1.5717, "step": 6678 }, { "epoch": 0.29700769196567517, - "grad_norm": 0.08637960255146027, + "grad_norm": 0.08736233413219452, "learning_rate": 0.000989975538944542, - "loss": 1.5721, + "loss": 1.5741, "step": 6680 }, { "epoch": 0.29709661642434754, - "grad_norm": 0.0903075709939003, + "grad_norm": 0.08750621974468231, "learning_rate": 0.000989968501927184, - "loss": 1.5681, + "loss": 1.5702, "step": 6682 }, { "epoch": 0.29718554088301985, - "grad_norm": 0.09254927188158035, + "grad_norm": 0.08598420023918152, "learning_rate": 0.0009899614624657817, - "loss": 1.5738, + "loss": 1.5622, "step": 6684 }, { "epoch": 0.2972744653416922, - "grad_norm": 0.0908125564455986, + "grad_norm": 0.08676007390022278, "learning_rate": 0.0009899544205603705, - "loss": 1.5687, + "loss": 1.5741, "step": 6686 }, { "epoch": 0.2973633898003646, - "grad_norm": 0.08839623630046844, + "grad_norm": 0.08978342264890671, "learning_rate": 0.0009899473762109855, - "loss": 1.573, + "loss": 1.5686, "step": 6688 }, { "epoch": 0.29745231425903695, - "grad_norm": 0.09364596009254456, + "grad_norm": 0.08682754635810852, "learning_rate": 0.0009899403294176613, - "loss": 1.5744, + "loss": 1.5657, "step": 6690 }, { "epoch": 0.2975412387177093, - "grad_norm": 0.09233032912015915, + "grad_norm": 0.08936390280723572, "learning_rate": 0.0009899332801804335, - "loss": 1.575, + "loss": 1.5699, "step": 6692 }, { "epoch": 0.2976301631763817, - "grad_norm": 0.08950147777795792, + "grad_norm": 0.09042416512966156, "learning_rate": 0.0009899262284993372, - "loss": 1.5667, + "loss": 1.5695, "step": 6694 }, { "epoch": 0.297719087635054, - "grad_norm": 0.087605781853199, + "grad_norm": 0.08619032055139542, "learning_rate": 0.0009899191743744076, - "loss": 1.575, + "loss": 1.5632, "step": 6696 }, { "epoch": 0.29780801209372637, - "grad_norm": 0.09101510047912598, + "grad_norm": 0.0877993255853653, "learning_rate": 0.0009899121178056798, - "loss": 1.571, + "loss": 1.5744, "step": 6698 }, { "epoch": 0.29789693655239874, - "grad_norm": 0.09050039947032928, + "grad_norm": 0.09133929759263992, "learning_rate": 0.000989905058793189, - "loss": 1.566, + "loss": 1.5672, "step": 6700 }, { "epoch": 0.2979858610110711, - "grad_norm": 0.08767402172088623, + "grad_norm": 0.08733952045440674, "learning_rate": 0.0009898979973369706, - "loss": 1.5771, + "loss": 1.5676, "step": 6702 }, { "epoch": 0.2980747854697435, - "grad_norm": 0.08621323853731155, + "grad_norm": 0.08532746136188507, "learning_rate": 0.0009898909334370594, - "loss": 1.569, + "loss": 1.5743, "step": 6704 }, { "epoch": 0.2981637099284158, - "grad_norm": 0.08890756964683533, + "grad_norm": 0.08905241638422012, "learning_rate": 0.0009898838670934912, - "loss": 1.5766, + "loss": 1.5686, "step": 6706 }, { "epoch": 0.29825263438708816, - "grad_norm": 0.08613884449005127, + "grad_norm": 0.09203900396823883, "learning_rate": 0.0009898767983063008, - "loss": 1.5734, + "loss": 1.576, "step": 6708 }, { "epoch": 0.2983415588457605, - "grad_norm": 0.08627349883317947, + "grad_norm": 0.20671740174293518, "learning_rate": 0.0009898697270755237, - "loss": 1.5705, + "loss": 1.5761, "step": 6710 }, { "epoch": 0.2984304833044329, - "grad_norm": 0.08452925086021423, + "grad_norm": 0.22460798919200897, "learning_rate": 0.000989862653401195, - "loss": 1.571, + "loss": 1.5757, "step": 6712 }, { "epoch": 0.29851940776310526, - "grad_norm": 0.08761037141084671, + "grad_norm": 0.11507432907819748, "learning_rate": 0.00098985557728335, - "loss": 1.5689, + "loss": 1.5788, "step": 6714 }, { "epoch": 0.2986083322217776, - "grad_norm": 0.08883599191904068, + "grad_norm": 0.09615212678909302, "learning_rate": 0.0009898484987220242, - "loss": 1.5764, + "loss": 1.5698, "step": 6716 }, { "epoch": 0.29869725668044994, - "grad_norm": 0.0876917690038681, + "grad_norm": 0.14628173410892487, "learning_rate": 0.0009898414177172527, - "loss": 1.5683, + "loss": 1.5708, "step": 6718 }, { "epoch": 0.2987861811391223, - "grad_norm": 0.08556470274925232, + "grad_norm": 0.10161300748586655, "learning_rate": 0.000989834334269071, - "loss": 1.5653, + "loss": 1.5803, "step": 6720 }, { "epoch": 0.2988751055977947, - "grad_norm": 0.08305517584085464, + "grad_norm": 0.09993571043014526, "learning_rate": 0.0009898272483775143, - "loss": 1.5678, + "loss": 1.5728, "step": 6722 }, { "epoch": 0.29896403005646704, - "grad_norm": 0.08866001665592194, + "grad_norm": 0.0945572704076767, "learning_rate": 0.0009898201600426178, - "loss": 1.569, + "loss": 1.5717, "step": 6724 }, { "epoch": 0.2990529545151394, - "grad_norm": 0.08552645891904831, + "grad_norm": 0.10044775158166885, "learning_rate": 0.0009898130692644173, - "loss": 1.5668, + "loss": 1.5734, "step": 6726 }, { "epoch": 0.2991418789738117, - "grad_norm": 0.08816053718328476, + "grad_norm": 0.09027940034866333, "learning_rate": 0.0009898059760429477, - "loss": 1.5689, + "loss": 1.5767, "step": 6728 }, { "epoch": 0.2992308034324841, - "grad_norm": 0.0867980495095253, + "grad_norm": 0.09142522513866425, "learning_rate": 0.0009897988803782448, - "loss": 1.5729, + "loss": 1.5732, "step": 6730 }, { "epoch": 0.29931972789115646, - "grad_norm": 0.08712033182382584, + "grad_norm": 0.1005646288394928, "learning_rate": 0.0009897917822703436, - "loss": 1.5727, + "loss": 1.5717, "step": 6732 }, { "epoch": 0.29940865234982883, - "grad_norm": 0.08813479542732239, + "grad_norm": 0.09144769608974457, "learning_rate": 0.0009897846817192796, - "loss": 1.5752, + "loss": 1.5635, "step": 6734 }, { "epoch": 0.2994975768085012, - "grad_norm": 0.08632966876029968, + "grad_norm": 0.09377957135438919, "learning_rate": 0.0009897775787250887, - "loss": 1.5655, + "loss": 1.5698, "step": 6736 }, { "epoch": 0.2995865012671735, - "grad_norm": 0.08634035289287567, + "grad_norm": 0.09546922147274017, "learning_rate": 0.0009897704732878057, - "loss": 1.5738, + "loss": 1.5698, "step": 6738 }, { "epoch": 0.2996754257258459, - "grad_norm": 0.08398577570915222, + "grad_norm": 0.08886546641588211, "learning_rate": 0.0009897633654074663, - "loss": 1.5668, + "loss": 1.5649, "step": 6740 }, { "epoch": 0.29976435018451825, - "grad_norm": 0.08463684469461441, + "grad_norm": 0.0911850556731224, "learning_rate": 0.0009897562550841058, - "loss": 1.5767, + "loss": 1.5702, "step": 6742 }, { "epoch": 0.2998532746431906, - "grad_norm": 0.08549610525369644, + "grad_norm": 0.09427014738321304, "learning_rate": 0.0009897491423177598, - "loss": 1.5644, + "loss": 1.5766, "step": 6744 }, { "epoch": 0.299942199101863, - "grad_norm": 0.08854731172323227, + "grad_norm": 0.1038573682308197, "learning_rate": 0.000989742027108464, - "loss": 1.5677, + "loss": 1.574, "step": 6746 }, { "epoch": 0.30003112356053535, - "grad_norm": 0.08542992174625397, + "grad_norm": 0.08610797673463821, "learning_rate": 0.0009897349094562535, - "loss": 1.5724, + "loss": 1.5718, "step": 6748 }, { "epoch": 0.30012004801920766, - "grad_norm": 0.0850505530834198, + "grad_norm": 0.09136275202035904, "learning_rate": 0.0009897277893611642, - "loss": 1.5747, + "loss": 1.5716, "step": 6750 }, { "epoch": 0.30020897247788003, - "grad_norm": 0.08848162740468979, + "grad_norm": 0.09047017991542816, "learning_rate": 0.0009897206668232313, - "loss": 1.5671, + "loss": 1.5739, "step": 6752 }, { "epoch": 0.3002978969365524, - "grad_norm": 0.08766396343708038, + "grad_norm": 0.08960193395614624, "learning_rate": 0.0009897135418424903, - "loss": 1.57, + "loss": 1.5674, "step": 6754 }, { "epoch": 0.30038682139522477, - "grad_norm": 0.0817844346165657, + "grad_norm": 0.08961663395166397, "learning_rate": 0.000989706414418977, - "loss": 1.569, + "loss": 1.5759, "step": 6756 }, { "epoch": 0.30047574585389714, - "grad_norm": 0.08586768060922623, + "grad_norm": 0.08908514678478241, "learning_rate": 0.0009896992845527266, - "loss": 1.5717, + "loss": 1.5714, "step": 6758 }, { "epoch": 0.30056467031256945, - "grad_norm": 0.08401080965995789, + "grad_norm": 0.08930535614490509, "learning_rate": 0.000989692152243775, - "loss": 1.5732, + "loss": 1.5694, "step": 6760 }, { "epoch": 0.3006535947712418, - "grad_norm": 0.08628005534410477, + "grad_norm": 0.09298436343669891, "learning_rate": 0.0009896850174921578, - "loss": 1.5722, + "loss": 1.5753, "step": 6762 }, { "epoch": 0.3007425192299142, - "grad_norm": 0.082597516477108, + "grad_norm": 0.08789312839508057, "learning_rate": 0.0009896778802979103, - "loss": 1.5643, + "loss": 1.5727, "step": 6764 }, { "epoch": 0.30083144368858655, - "grad_norm": 0.08688101172447205, + "grad_norm": 0.08950556069612503, "learning_rate": 0.0009896707406610683, - "loss": 1.5691, + "loss": 1.5732, "step": 6766 }, { "epoch": 0.3009203681472589, - "grad_norm": 0.08769115060567856, + "grad_norm": 0.0896010622382164, "learning_rate": 0.0009896635985816673, - "loss": 1.5687, + "loss": 1.5746, "step": 6768 }, { "epoch": 0.3010092926059313, - "grad_norm": 0.08656857162714005, + "grad_norm": 0.0874442532658577, "learning_rate": 0.0009896564540597431, - "loss": 1.5761, + "loss": 1.5708, "step": 6770 }, { "epoch": 0.3010982170646036, - "grad_norm": 0.08850487321615219, + "grad_norm": 0.08776126801967621, "learning_rate": 0.0009896493070953311, - "loss": 1.569, + "loss": 1.5758, "step": 6772 }, { "epoch": 0.30118714152327597, - "grad_norm": 0.08894409239292145, + "grad_norm": 0.08940175920724869, "learning_rate": 0.0009896421576884673, - "loss": 1.5736, + "loss": 1.5719, "step": 6774 }, { "epoch": 0.30127606598194834, - "grad_norm": 0.08888617902994156, + "grad_norm": 0.08825140446424484, "learning_rate": 0.000989635005839187, - "loss": 1.5695, + "loss": 1.5693, "step": 6776 }, { "epoch": 0.3013649904406207, - "grad_norm": 0.08289193361997604, + "grad_norm": 0.08824607729911804, "learning_rate": 0.000989627851547526, - "loss": 1.5692, + "loss": 1.5658, "step": 6778 }, { "epoch": 0.3014539148992931, - "grad_norm": 0.08422806859016418, + "grad_norm": 0.08684536069631577, "learning_rate": 0.00098962069481352, - "loss": 1.5659, + "loss": 1.5636, "step": 6780 }, { "epoch": 0.3015428393579654, - "grad_norm": 0.0865112841129303, + "grad_norm": 0.08771684765815735, "learning_rate": 0.0009896135356372046, - "loss": 1.5711, + "loss": 1.573, "step": 6782 }, { "epoch": 0.30163176381663775, - "grad_norm": 0.08711813390254974, + "grad_norm": 0.08880012482404709, "learning_rate": 0.0009896063740186159, - "loss": 1.5691, + "loss": 1.5673, "step": 6784 }, { "epoch": 0.3017206882753101, - "grad_norm": 0.08997444808483124, + "grad_norm": 0.08657423406839371, "learning_rate": 0.000989599209957789, - "loss": 1.5626, + "loss": 1.5675, "step": 6786 }, { "epoch": 0.3018096127339825, - "grad_norm": 0.08878400176763535, + "grad_norm": 0.0867740735411644, "learning_rate": 0.0009895920434547603, - "loss": 1.5712, + "loss": 1.5677, "step": 6788 }, { "epoch": 0.30189853719265486, - "grad_norm": 0.08929741382598877, + "grad_norm": 0.08525516837835312, "learning_rate": 0.000989584874509565, - "loss": 1.5704, + "loss": 1.5654, "step": 6790 }, { "epoch": 0.30198746165132717, - "grad_norm": 0.09282039105892181, + "grad_norm": 0.08830083906650543, "learning_rate": 0.0009895777031222392, - "loss": 1.5689, + "loss": 1.5672, "step": 6792 }, { "epoch": 0.30207638610999954, - "grad_norm": 0.08617766946554184, + "grad_norm": 0.0843716561794281, "learning_rate": 0.0009895705292928185, - "loss": 1.5718, + "loss": 1.5728, "step": 6794 }, { "epoch": 0.3021653105686719, - "grad_norm": 0.08884678781032562, + "grad_norm": 0.0873875841498375, "learning_rate": 0.0009895633530213387, - "loss": 1.5664, + "loss": 1.5615, "step": 6796 }, { "epoch": 0.3022542350273443, - "grad_norm": 0.09330635517835617, + "grad_norm": 0.08721845597028732, "learning_rate": 0.0009895561743078357, - "loss": 1.5718, + "loss": 1.5626, "step": 6798 }, { "epoch": 0.30234315948601664, - "grad_norm": 0.09254085272550583, + "grad_norm": 0.09410282224416733, "learning_rate": 0.0009895489931523451, - "loss": 1.5694, + "loss": 1.5711, "step": 6800 }, { "epoch": 0.302432083944689, - "grad_norm": 0.08448589593172073, + "grad_norm": 0.0908014252781868, "learning_rate": 0.000989541809554903, - "loss": 1.57, + "loss": 1.5703, "step": 6802 }, { "epoch": 0.3025210084033613, - "grad_norm": 0.08677145093679428, + "grad_norm": 0.08996500074863434, "learning_rate": 0.0009895346235155452, - "loss": 1.5702, + "loss": 1.5712, "step": 6804 }, { "epoch": 0.3026099328620337, - "grad_norm": 0.0891798809170723, + "grad_norm": 0.08390094339847565, "learning_rate": 0.000989527435034307, - "loss": 1.566, + "loss": 1.5733, "step": 6806 }, { "epoch": 0.30269885732070606, - "grad_norm": 0.08697128295898438, + "grad_norm": 0.08549515902996063, "learning_rate": 0.0009895202441112252, - "loss": 1.5727, + "loss": 1.5645, "step": 6808 }, { "epoch": 0.30278778177937843, - "grad_norm": 0.08771656453609467, + "grad_norm": 0.08361940085887909, "learning_rate": 0.000989513050746335, - "loss": 1.5665, + "loss": 1.569, "step": 6810 }, { "epoch": 0.3028767062380508, - "grad_norm": 0.08761769533157349, + "grad_norm": 0.08554235100746155, "learning_rate": 0.0009895058549396723, - "loss": 1.571, + "loss": 1.5686, "step": 6812 }, { "epoch": 0.3029656306967231, - "grad_norm": 0.08636198937892914, + "grad_norm": 0.08416851609945297, "learning_rate": 0.0009894986566912733, - "loss": 1.5675, + "loss": 1.5664, "step": 6814 }, { "epoch": 0.3030545551553955, - "grad_norm": 0.08739641308784485, + "grad_norm": 0.08411978930234909, "learning_rate": 0.0009894914560011736, - "loss": 1.5693, + "loss": 1.5648, "step": 6816 }, { "epoch": 0.30314347961406785, - "grad_norm": 0.08318977057933807, + "grad_norm": 0.08546824753284454, "learning_rate": 0.0009894842528694095, - "loss": 1.5738, + "loss": 1.5651, "step": 6818 }, { "epoch": 0.3032324040727402, - "grad_norm": 0.0846974328160286, + "grad_norm": 0.08174564689397812, "learning_rate": 0.0009894770472960164, - "loss": 1.564, + "loss": 1.5734, "step": 6820 }, { "epoch": 0.3033213285314126, - "grad_norm": 0.08627136796712875, + "grad_norm": 0.08649101853370667, "learning_rate": 0.0009894698392810307, - "loss": 1.568, + "loss": 1.5737, "step": 6822 }, { "epoch": 0.30341025299008495, - "grad_norm": 0.08277648687362671, + "grad_norm": 0.0850704163312912, "learning_rate": 0.0009894626288244883, - "loss": 1.5683, + "loss": 1.5716, "step": 6824 }, { "epoch": 0.30349917744875726, - "grad_norm": 0.08420710265636444, + "grad_norm": 0.08794053643941879, "learning_rate": 0.0009894554159264248, - "loss": 1.5724, + "loss": 1.569, "step": 6826 }, { "epoch": 0.30358810190742963, - "grad_norm": 0.08524946123361588, + "grad_norm": 0.08536579459905624, "learning_rate": 0.0009894482005868767, - "loss": 1.5631, + "loss": 1.5673, "step": 6828 }, { "epoch": 0.303677026366102, - "grad_norm": 0.08944839984178543, + "grad_norm": 0.08639626950025558, "learning_rate": 0.0009894409828058795, - "loss": 1.5632, + "loss": 1.5668, "step": 6830 }, { "epoch": 0.30376595082477437, - "grad_norm": 0.08237938582897186, + "grad_norm": 0.0900898426771164, "learning_rate": 0.0009894337625834695, - "loss": 1.5659, + "loss": 1.566, "step": 6832 }, { "epoch": 0.30385487528344673, - "grad_norm": 0.08536158502101898, + "grad_norm": 0.07988198101520538, "learning_rate": 0.0009894265399196828, - "loss": 1.5684, + "loss": 1.5659, "step": 6834 }, { "epoch": 0.30394379974211905, - "grad_norm": 0.08481699973344803, + "grad_norm": 0.08728215098381042, "learning_rate": 0.0009894193148145551, - "loss": 1.57, + "loss": 1.5667, "step": 6836 }, { "epoch": 0.3040327242007914, - "grad_norm": 0.08328551054000854, + "grad_norm": 0.08969441801309586, "learning_rate": 0.000989412087268123, - "loss": 1.5671, + "loss": 1.5712, "step": 6838 }, { "epoch": 0.3041216486594638, - "grad_norm": 0.08307883888483047, + "grad_norm": 0.0852469727396965, "learning_rate": 0.0009894048572804217, - "loss": 1.564, + "loss": 1.576, "step": 6840 }, { "epoch": 0.30421057311813615, - "grad_norm": 0.08421258628368378, + "grad_norm": 0.08066057413816452, "learning_rate": 0.000989397624851488, - "loss": 1.5631, + "loss": 1.5643, "step": 6842 }, { "epoch": 0.3042994975768085, - "grad_norm": 0.0846334770321846, + "grad_norm": 0.08737246692180634, "learning_rate": 0.0009893903899813575, - "loss": 1.5634, + "loss": 1.5672, "step": 6844 }, { "epoch": 0.30438842203548083, - "grad_norm": 0.08747091889381409, + "grad_norm": 0.08560658246278763, "learning_rate": 0.0009893831526700666, - "loss": 1.5723, + "loss": 1.5696, "step": 6846 }, { "epoch": 0.3044773464941532, - "grad_norm": 0.08625800907611847, + "grad_norm": 0.0847938284277916, "learning_rate": 0.0009893759129176514, - "loss": 1.5721, + "loss": 1.5641, "step": 6848 }, { "epoch": 0.30456627095282557, - "grad_norm": 0.08458415418863297, + "grad_norm": 0.08661984652280807, "learning_rate": 0.000989368670724148, - "loss": 1.5708, + "loss": 1.5687, "step": 6850 }, { "epoch": 0.30465519541149794, - "grad_norm": 0.08511239290237427, + "grad_norm": 0.08374079316854477, "learning_rate": 0.000989361426089592, - "loss": 1.568, + "loss": 1.5665, "step": 6852 }, { "epoch": 0.3047441198701703, - "grad_norm": 0.08559386432170868, + "grad_norm": 0.08264697343111038, "learning_rate": 0.0009893541790140202, - "loss": 1.5731, + "loss": 1.5669, "step": 6854 }, { "epoch": 0.3048330443288427, - "grad_norm": 0.08498529344797134, + "grad_norm": 0.08502277731895447, "learning_rate": 0.0009893469294974687, - "loss": 1.5673, + "loss": 1.5616, "step": 6856 }, { "epoch": 0.304921968787515, - "grad_norm": 0.08727027475833893, + "grad_norm": 0.08124475181102753, "learning_rate": 0.0009893396775399732, - "loss": 1.5682, + "loss": 1.5685, "step": 6858 }, { "epoch": 0.30501089324618735, - "grad_norm": 0.0869452953338623, + "grad_norm": 0.08523332327604294, "learning_rate": 0.0009893324231415703, - "loss": 1.5667, + "loss": 1.5616, "step": 6860 }, { "epoch": 0.3050998177048597, - "grad_norm": 0.0843091756105423, + "grad_norm": 0.08559540659189224, "learning_rate": 0.0009893251663022963, - "loss": 1.574, + "loss": 1.5699, "step": 6862 }, { "epoch": 0.3051887421635321, - "grad_norm": 0.08598976582288742, + "grad_norm": 0.08699463307857513, "learning_rate": 0.000989317907022187, - "loss": 1.5722, + "loss": 1.5725, "step": 6864 }, { "epoch": 0.30527766662220446, - "grad_norm": 0.09492961317300797, + "grad_norm": 0.09094923734664917, "learning_rate": 0.0009893106453012785, - "loss": 1.5621, + "loss": 1.5698, "step": 6866 }, { "epoch": 0.30536659108087677, - "grad_norm": 0.09004738181829453, + "grad_norm": 0.08431901037693024, "learning_rate": 0.0009893033811396075, - "loss": 1.5711, + "loss": 1.5723, "step": 6868 }, { "epoch": 0.30545551553954914, - "grad_norm": 0.08967278897762299, + "grad_norm": 0.08522892743349075, "learning_rate": 0.00098929611453721, - "loss": 1.5721, + "loss": 1.5645, "step": 6870 }, { "epoch": 0.3055444399982215, - "grad_norm": 0.09083294868469238, + "grad_norm": 0.08593329787254333, "learning_rate": 0.000989288845494122, - "loss": 1.5652, + "loss": 1.5708, "step": 6872 }, { "epoch": 0.3056333644568939, - "grad_norm": 0.08368532359600067, + "grad_norm": 0.08536642789840698, "learning_rate": 0.0009892815740103805, - "loss": 1.5633, + "loss": 1.5676, "step": 6874 }, { "epoch": 0.30572228891556624, - "grad_norm": 0.08711462467908859, + "grad_norm": 0.08635646104812622, "learning_rate": 0.000989274300086021, - "loss": 1.573, + "loss": 1.5669, "step": 6876 }, { "epoch": 0.3058112133742386, - "grad_norm": 0.08847599476575851, + "grad_norm": 0.09484677016735077, "learning_rate": 0.00098926702372108, - "loss": 1.569, + "loss": 1.5701, "step": 6878 }, { "epoch": 0.3059001378329109, - "grad_norm": 0.08406464010477066, + "grad_norm": 0.08951367437839508, "learning_rate": 0.000989259744915594, - "loss": 1.5731, + "loss": 1.5668, "step": 6880 }, { "epoch": 0.3059890622915833, - "grad_norm": 0.08603305369615555, + "grad_norm": 0.0868958830833435, "learning_rate": 0.0009892524636695992, - "loss": 1.5643, + "loss": 1.5653, "step": 6882 }, { "epoch": 0.30607798675025566, - "grad_norm": 0.08495331555604935, + "grad_norm": 0.08445209264755249, "learning_rate": 0.0009892451799831318, - "loss": 1.5658, + "loss": 1.5696, "step": 6884 }, { "epoch": 0.306166911208928, - "grad_norm": 0.08606935292482376, + "grad_norm": 0.08534303307533264, "learning_rate": 0.0009892378938562284, - "loss": 1.5641, + "loss": 1.5702, "step": 6886 }, { "epoch": 0.3062558356676004, - "grad_norm": 0.08674218505620956, + "grad_norm": 0.09028670191764832, "learning_rate": 0.0009892306052889251, - "loss": 1.5754, + "loss": 1.5702, "step": 6888 }, { "epoch": 0.3063447601262727, - "grad_norm": 0.08873141556978226, + "grad_norm": 0.08568151295185089, "learning_rate": 0.0009892233142812584, - "loss": 1.5666, + "loss": 1.5752, "step": 6890 }, { "epoch": 0.3064336845849451, - "grad_norm": 0.08596460521221161, + "grad_norm": 0.08130050450563431, "learning_rate": 0.0009892160208332644, - "loss": 1.568, + "loss": 1.5648, "step": 6892 }, { "epoch": 0.30652260904361744, - "grad_norm": 0.08701075613498688, + "grad_norm": 0.08191067725419998, "learning_rate": 0.0009892087249449798, - "loss": 1.5651, + "loss": 1.5686, "step": 6894 }, { "epoch": 0.3066115335022898, - "grad_norm": 0.09671713411808014, + "grad_norm": 0.08696281164884567, "learning_rate": 0.0009892014266164408, - "loss": 1.5728, + "loss": 1.5645, "step": 6896 }, { "epoch": 0.3067004579609622, - "grad_norm": 0.09461056441068649, + "grad_norm": 0.09006158262491226, "learning_rate": 0.000989194125847684, - "loss": 1.5615, + "loss": 1.5665, "step": 6898 }, { "epoch": 0.3067893824196345, - "grad_norm": 0.0874013602733612, + "grad_norm": 0.08667782694101334, "learning_rate": 0.0009891868226387455, - "loss": 1.569, + "loss": 1.5627, "step": 6900 }, { "epoch": 0.30687830687830686, - "grad_norm": 0.08886938542127609, + "grad_norm": 0.08659591525793076, "learning_rate": 0.0009891795169896622, - "loss": 1.5652, + "loss": 1.5606, "step": 6902 }, { "epoch": 0.30696723133697923, - "grad_norm": 0.08856808394193649, + "grad_norm": 0.0922919362783432, "learning_rate": 0.0009891722089004702, - "loss": 1.5633, + "loss": 1.5638, "step": 6904 }, { "epoch": 0.3070561557956516, - "grad_norm": 0.08798424154520035, + "grad_norm": 0.09142550826072693, "learning_rate": 0.0009891648983712058, - "loss": 1.5665, + "loss": 1.5736, "step": 6906 }, { "epoch": 0.30714508025432397, - "grad_norm": 0.08057578653097153, + "grad_norm": 0.085682712495327, "learning_rate": 0.0009891575854019058, - "loss": 1.5743, + "loss": 1.5656, "step": 6908 }, { "epoch": 0.30723400471299633, - "grad_norm": 0.08890609443187714, + "grad_norm": 0.0815911665558815, "learning_rate": 0.0009891502699926067, - "loss": 1.5616, + "loss": 1.5646, "step": 6910 }, { "epoch": 0.30732292917166865, - "grad_norm": 0.09081149101257324, + "grad_norm": 0.08682998269796371, "learning_rate": 0.0009891429521433448, - "loss": 1.5688, + "loss": 1.5631, "step": 6912 }, { "epoch": 0.307411853630341, - "grad_norm": 0.08633463829755783, + "grad_norm": 0.0891093984246254, "learning_rate": 0.0009891356318541566, - "loss": 1.5676, + "loss": 1.5678, "step": 6914 }, { "epoch": 0.3075007780890134, - "grad_norm": 0.08525501191616058, + "grad_norm": 0.08850381523370743, "learning_rate": 0.000989128309125079, - "loss": 1.5688, + "loss": 1.5682, "step": 6916 }, { "epoch": 0.30758970254768575, - "grad_norm": 0.08166355639696121, + "grad_norm": 0.08838870376348495, "learning_rate": 0.0009891209839561477, - "loss": 1.5673, + "loss": 1.5721, "step": 6918 }, { "epoch": 0.3076786270063581, - "grad_norm": 0.0858922079205513, + "grad_norm": 0.08760442584753036, "learning_rate": 0.0009891136563474, - "loss": 1.5689, + "loss": 1.5692, "step": 6920 }, { "epoch": 0.30776755146503043, - "grad_norm": 0.08496145159006119, + "grad_norm": 0.08644498139619827, "learning_rate": 0.0009891063262988723, - "loss": 1.5636, + "loss": 1.566, "step": 6922 }, { "epoch": 0.3078564759237028, - "grad_norm": 0.08461374044418335, + "grad_norm": 0.08855952322483063, "learning_rate": 0.0009890989938106008, - "loss": 1.5683, + "loss": 1.567, "step": 6924 }, { "epoch": 0.30794540038237517, - "grad_norm": 0.08948436379432678, + "grad_norm": 0.08846352994441986, "learning_rate": 0.0009890916588826224, - "loss": 1.5657, + "loss": 1.5695, "step": 6926 }, { "epoch": 0.30803432484104754, - "grad_norm": 0.09179355949163437, + "grad_norm": 0.08989361673593521, "learning_rate": 0.0009890843215149736, "loss": 1.568, "step": 6928 }, { "epoch": 0.3081232492997199, - "grad_norm": 0.09200924634933472, + "grad_norm": 0.0872996598482132, "learning_rate": 0.0009890769817076912, - "loss": 1.567, + "loss": 1.5655, "step": 6930 }, { "epoch": 0.30821217375839227, - "grad_norm": 0.09283183515071869, + "grad_norm": 0.08718729019165039, "learning_rate": 0.0009890696394608115, - "loss": 1.5718, + "loss": 1.5768, "step": 6932 }, { "epoch": 0.3083010982170646, - "grad_norm": 0.11160538345575333, + "grad_norm": 0.08712806552648544, "learning_rate": 0.0009890622947743713, - "loss": 1.5676, + "loss": 1.5647, "step": 6934 }, { "epoch": 0.30839002267573695, - "grad_norm": 0.08908142149448395, + "grad_norm": 0.0897906944155693, "learning_rate": 0.0009890549476484073, - "loss": 1.57, + "loss": 1.5751, "step": 6936 }, { "epoch": 0.3084789471344093, - "grad_norm": 0.08438417315483093, + "grad_norm": 0.1027601882815361, "learning_rate": 0.0009890475980829558, - "loss": 1.5668, + "loss": 1.5664, "step": 6938 }, { "epoch": 0.3085678715930817, - "grad_norm": 0.08734441548585892, + "grad_norm": 0.11154686659574509, "learning_rate": 0.0009890402460780538, - "loss": 1.565, + "loss": 1.5664, "step": 6940 }, { "epoch": 0.30865679605175406, - "grad_norm": 0.08454196900129318, + "grad_norm": 0.19219696521759033, "learning_rate": 0.000989032891633738, - "loss": 1.5681, + "loss": 1.5769, "step": 6942 }, { "epoch": 0.30874572051042637, - "grad_norm": 0.08919329941272736, + "grad_norm": 0.10125477612018585, "learning_rate": 0.0009890255347500447, - "loss": 1.5709, + "loss": 1.5801, "step": 6944 }, { "epoch": 0.30883464496909874, - "grad_norm": 0.08113396912813187, + "grad_norm": 0.10321390628814697, "learning_rate": 0.0009890181754270112, - "loss": 1.5699, + "loss": 1.5713, "step": 6946 }, { "epoch": 0.3089235694277711, - "grad_norm": 0.08310727030038834, + "grad_norm": 0.09801045060157776, "learning_rate": 0.0009890108136646736, - "loss": 1.5735, + "loss": 1.5664, "step": 6948 }, { "epoch": 0.3090124938864435, - "grad_norm": 0.09053000062704086, + "grad_norm": 0.09143440425395966, "learning_rate": 0.0009890034494630688, - "loss": 1.5727, + "loss": 1.5759, "step": 6950 }, { "epoch": 0.30910141834511584, - "grad_norm": 0.08727260679006577, + "grad_norm": 0.09067144989967346, "learning_rate": 0.000988996082822234, - "loss": 1.5666, + "loss": 1.5677, "step": 6952 }, { "epoch": 0.3091903428037882, - "grad_norm": 0.0830412209033966, + "grad_norm": 0.09082882851362228, "learning_rate": 0.0009889887137422054, - "loss": 1.5735, + "loss": 1.5746, "step": 6954 }, { "epoch": 0.3092792672624605, - "grad_norm": 0.08488724380731583, + "grad_norm": 0.087799571454525, "learning_rate": 0.0009889813422230198, - "loss": 1.5732, + "loss": 1.5667, "step": 6956 }, { "epoch": 0.3093681917211329, - "grad_norm": 0.1368086040019989, + "grad_norm": 0.08629163354635239, "learning_rate": 0.000988973968264714, - "loss": 1.5706, + "loss": 1.5645, "step": 6958 }, { "epoch": 0.30945711617980526, - "grad_norm": 0.08950672298669815, + "grad_norm": 0.08560052514076233, "learning_rate": 0.000988966591867325, - "loss": 1.57, + "loss": 1.5647, "step": 6960 }, { "epoch": 0.3095460406384776, - "grad_norm": 0.09720595926046371, + "grad_norm": 0.08497489988803864, "learning_rate": 0.0009889592130308898, - "loss": 1.5716, + "loss": 1.5611, "step": 6962 }, { "epoch": 0.30963496509715, - "grad_norm": 0.5454443693161011, + "grad_norm": 0.08397437632083893, "learning_rate": 0.0009889518317554445, - "loss": 1.5856, + "loss": 1.5667, "step": 6964 }, { "epoch": 0.3097238895558223, - "grad_norm": 14.046714782714844, + "grad_norm": 0.08864140510559082, "learning_rate": 0.0009889444480410267, - "loss": 3.2087, + "loss": 1.5644, "step": 6966 }, { "epoch": 0.3098128140144947, - "grad_norm": 11.143640518188477, + "grad_norm": 0.0851002037525177, "learning_rate": 0.0009889370618876723, - "loss": 3.4962, + "loss": 1.5669, "step": 6968 }, { "epoch": 0.30990173847316704, - "grad_norm": 13.60962200164795, + "grad_norm": 0.08869633823633194, "learning_rate": 0.000988929673295419, - "loss": 7.0729, + "loss": 1.5676, "step": 6970 }, { "epoch": 0.3099906629318394, - "grad_norm": 17.425151824951172, + "grad_norm": 0.08750467002391815, "learning_rate": 0.0009889222822643032, - "loss": 7.9451, + "loss": 1.5641, "step": 6972 }, { "epoch": 0.3100795873905118, - "grad_norm": 1.2768274545669556, + "grad_norm": 0.08245990425348282, "learning_rate": 0.000988914888794362, - "loss": 7.8122, + "loss": 1.5615, "step": 6974 }, { "epoch": 0.3101685118491841, - "grad_norm": 1.536791443824768, + "grad_norm": 0.08736477047204971, "learning_rate": 0.0009889074928856323, - "loss": 7.7427, + "loss": 1.5645, "step": 6976 }, { "epoch": 0.31025743630785646, - "grad_norm": 1.5738643407821655, + "grad_norm": 0.09274744242429733, "learning_rate": 0.0009889000945381506, - "loss": 9.3257, + "loss": 1.5684, "step": 6978 }, { "epoch": 0.31034636076652883, - "grad_norm": 0.8974425792694092, + "grad_norm": 0.08163709938526154, "learning_rate": 0.0009888926937519542, - "loss": 7.3177, + "loss": 1.5637, "step": 6980 }, { "epoch": 0.3104352852252012, - "grad_norm": 0.7683629989624023, + "grad_norm": 0.08836798369884491, "learning_rate": 0.00098888529052708, - "loss": 6.9533, + "loss": 1.565, "step": 6982 }, { "epoch": 0.31052420968387356, - "grad_norm": 0.803458034992218, + "grad_norm": 0.08659311383962631, "learning_rate": 0.0009888778848635646, - "loss": 6.9117, + "loss": 1.5653, "step": 6984 }, { "epoch": 0.31061313414254593, - "grad_norm": 0.44032931327819824, + "grad_norm": 0.08404355496168137, "learning_rate": 0.0009888704767614452, - "loss": 6.806, + "loss": 1.5669, "step": 6986 }, { "epoch": 0.31070205860121825, - "grad_norm": 0.3152598440647125, + "grad_norm": 0.08402011543512344, "learning_rate": 0.000988863066220759, - "loss": 6.7285, + "loss": 1.5588, "step": 6988 }, { "epoch": 0.3107909830598906, - "grad_norm": 0.3047941327095032, + "grad_norm": 0.08973820507526398, "learning_rate": 0.0009888556532415423, - "loss": 6.7029, + "loss": 1.5668, "step": 6990 }, { "epoch": 0.310879907518563, - "grad_norm": 0.23191867768764496, + "grad_norm": 0.08140117675065994, "learning_rate": 0.0009888482378238325, - "loss": 6.6799, + "loss": 1.5649, "step": 6992 }, { "epoch": 0.31096883197723535, - "grad_norm": 0.1787997931241989, + "grad_norm": 0.08229751884937286, "learning_rate": 0.0009888408199676666, - "loss": 6.6358, + "loss": 1.5577, "step": 6994 }, { "epoch": 0.3110577564359077, - "grad_norm": 0.17656953632831573, + "grad_norm": 0.08518966287374496, "learning_rate": 0.0009888333996730817, - "loss": 6.5889, + "loss": 1.5675, "step": 6996 }, { "epoch": 0.31114668089458003, - "grad_norm": 0.18115170300006866, + "grad_norm": 0.0844472348690033, "learning_rate": 0.0009888259769401148, - "loss": 6.5493, + "loss": 1.5685, "step": 6998 }, { "epoch": 0.3112356053532524, - "grad_norm": 0.18638622760772705, + "grad_norm": 0.08895467966794968, "learning_rate": 0.0009888185517688023, - "loss": 6.4761, + "loss": 1.5693, "step": 7000 }, { "epoch": 0.3112356053532524, - "eval_loss": 6.37610387802124, - "eval_runtime": 12.361, - "eval_samples_per_second": 559.018, - "eval_steps_per_second": 69.897, + "eval_loss": 1.5398365259170532, + "eval_runtime": 12.9215, + "eval_samples_per_second": 534.768, + "eval_steps_per_second": 66.865, "step": 7000 } ],