{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9977046671767407, "eval_steps": 500, "global_step": 1959, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015302218821729151, "grad_norm": 2.9595158525528062, "learning_rate": 1.0204081632653061e-05, "loss": 1.7025, "step": 10 }, { "epoch": 0.030604437643458302, "grad_norm": 1.263751808272248, "learning_rate": 2.0408163265306123e-05, "loss": 1.4094, "step": 20 }, { "epoch": 0.045906656465187455, "grad_norm": 1.1314499693827325, "learning_rate": 3.061224489795919e-05, "loss": 1.0742, "step": 30 }, { "epoch": 0.061208875286916604, "grad_norm": 0.43429385320294256, "learning_rate": 4.0816326530612245e-05, "loss": 0.8975, "step": 40 }, { "epoch": 0.07651109410864575, "grad_norm": 0.37910893323587813, "learning_rate": 5.102040816326531e-05, "loss": 0.7694, "step": 50 }, { "epoch": 0.09181331293037491, "grad_norm": 0.32985782659377816, "learning_rate": 6.122448979591838e-05, "loss": 0.6667, "step": 60 }, { "epoch": 0.10711553175210406, "grad_norm": 0.27993689719153514, "learning_rate": 7.142857142857143e-05, "loss": 0.6143, "step": 70 }, { "epoch": 0.12241775057383321, "grad_norm": 0.2562350918748776, "learning_rate": 8.163265306122449e-05, "loss": 0.5736, "step": 80 }, { "epoch": 0.13771996939556236, "grad_norm": 0.2627549443187762, "learning_rate": 9.183673469387756e-05, "loss": 0.5605, "step": 90 }, { "epoch": 0.1530221882172915, "grad_norm": 0.25450163445897056, "learning_rate": 0.00010204081632653062, "loss": 0.5425, "step": 100 }, { "epoch": 0.16832440703902066, "grad_norm": 0.265185130228763, "learning_rate": 0.00011224489795918367, "loss": 0.5417, "step": 110 }, { "epoch": 0.18362662586074982, "grad_norm": 0.2600738826510011, "learning_rate": 0.00012244897959183676, "loss": 0.5349, "step": 120 }, { "epoch": 0.19892884468247896, "grad_norm": 0.27451283782332153, "learning_rate": 0.0001326530612244898, "loss": 0.5225, "step": 130 }, { "epoch": 0.21423106350420812, "grad_norm": 0.29276216948080613, "learning_rate": 0.00014285714285714287, "loss": 0.5148, "step": 140 }, { "epoch": 0.22953328232593725, "grad_norm": 0.25373117999248507, "learning_rate": 0.0001530612244897959, "loss": 0.5108, "step": 150 }, { "epoch": 0.24483550114766642, "grad_norm": 0.26479254050664824, "learning_rate": 0.00016326530612244898, "loss": 0.5061, "step": 160 }, { "epoch": 0.26013771996939555, "grad_norm": 0.27232811822105624, "learning_rate": 0.00017346938775510205, "loss": 0.5065, "step": 170 }, { "epoch": 0.2754399387911247, "grad_norm": 0.26418337252250673, "learning_rate": 0.00018367346938775512, "loss": 0.4967, "step": 180 }, { "epoch": 0.2907421576128539, "grad_norm": 0.2503450300406005, "learning_rate": 0.00019387755102040816, "loss": 0.4998, "step": 190 }, { "epoch": 0.306044376434583, "grad_norm": 0.2339540170507543, "learning_rate": 0.0001999974597101728, "loss": 0.4989, "step": 200 }, { "epoch": 0.32134659525631215, "grad_norm": 0.23446730330407825, "learning_rate": 0.0001999688829317862, "loss": 0.4895, "step": 210 }, { "epoch": 0.3366488140780413, "grad_norm": 0.23685036500213205, "learning_rate": 0.00019990856311693857, "loss": 0.4898, "step": 220 }, { "epoch": 0.3519510328997705, "grad_norm": 0.2396529148643123, "learning_rate": 0.00019981651941893068, "loss": 0.4881, "step": 230 }, { "epoch": 0.36725325172149964, "grad_norm": 0.22794906865901965, "learning_rate": 0.0001996927810643216, "loss": 0.4825, "step": 240 }, { "epoch": 0.38255547054322875, 
"grad_norm": 0.21749849325438408, "learning_rate": 0.00019953738734364843, "loss": 0.4905, "step": 250 }, { "epoch": 0.3978576893649579, "grad_norm": 0.21941306857106413, "learning_rate": 0.00019935038759895038, "loss": 0.4844, "step": 260 }, { "epoch": 0.4131599081866871, "grad_norm": 0.2106883472247568, "learning_rate": 0.0001991318412081012, "loss": 0.4832, "step": 270 }, { "epoch": 0.42846212700841624, "grad_norm": 0.20643151525949935, "learning_rate": 0.00019888181756595513, "loss": 0.4732, "step": 280 }, { "epoch": 0.44376434583014535, "grad_norm": 0.21375448358620805, "learning_rate": 0.0001986003960623118, "loss": 0.4777, "step": 290 }, { "epoch": 0.4590665646518745, "grad_norm": 0.2078386748706879, "learning_rate": 0.0001982876660567078, "loss": 0.4773, "step": 300 }, { "epoch": 0.4743687834736037, "grad_norm": 0.20709767688640485, "learning_rate": 0.00019794372685004232, "loss": 0.4774, "step": 310 }, { "epoch": 0.48967100229533284, "grad_norm": 0.20114204358521104, "learning_rate": 0.00019756868765304637, "loss": 0.4724, "step": 320 }, { "epoch": 0.504973221117062, "grad_norm": 0.2047995080216857, "learning_rate": 0.000197162667551605, "loss": 0.4723, "step": 330 }, { "epoch": 0.5202754399387911, "grad_norm": 0.2065404899680565, "learning_rate": 0.00019672579546894418, "loss": 0.4744, "step": 340 }, { "epoch": 0.5355776587605203, "grad_norm": 0.20339328597837086, "learning_rate": 0.00019625821012469392, "loss": 0.469, "step": 350 }, { "epoch": 0.5508798775822494, "grad_norm": 0.20269051264889382, "learning_rate": 0.0001957600599908406, "loss": 0.4652, "step": 360 }, { "epoch": 0.5661820964039785, "grad_norm": 0.20140204074578216, "learning_rate": 0.00019523150324458297, "loss": 0.4663, "step": 370 }, { "epoch": 0.5814843152257078, "grad_norm": 0.20145479660046747, "learning_rate": 0.0001946727077181062, "loss": 0.4647, "step": 380 }, { "epoch": 0.5967865340474369, "grad_norm": 0.19812410832421173, "learning_rate": 0.00019408385084529014, "loss": 0.4669, "step": 390 }, { "epoch": 0.612088752869166, "grad_norm": 0.20593747645685082, "learning_rate": 0.0001934651196053692, "loss": 0.4606, "step": 400 }, { "epoch": 0.6273909716908952, "grad_norm": 0.19985586154896637, "learning_rate": 0.00019281671046356057, "loss": 0.465, "step": 410 }, { "epoch": 0.6426931905126243, "grad_norm": 0.18804149710765375, "learning_rate": 0.0001921388293086812, "loss": 0.4591, "step": 420 }, { "epoch": 0.6579954093343535, "grad_norm": 0.20403982281128524, "learning_rate": 0.00019143169138777176, "loss": 0.4612, "step": 430 }, { "epoch": 0.6732976281560826, "grad_norm": 0.19472172680549107, "learning_rate": 0.00019069552123774966, "loss": 0.4535, "step": 440 }, { "epoch": 0.6885998469778117, "grad_norm": 0.18853093267297413, "learning_rate": 0.00018993055261411188, "loss": 0.4536, "step": 450 }, { "epoch": 0.703902065799541, "grad_norm": 0.1824795154817547, "learning_rate": 0.0001891370284167108, "loss": 0.4533, "step": 460 }, { "epoch": 0.7192042846212701, "grad_norm": 0.19623573820167492, "learning_rate": 0.00018831520061262657, "loss": 0.4608, "step": 470 }, { "epoch": 0.7345065034429993, "grad_norm": 0.1894732171051352, "learning_rate": 0.00018746533015615997, "loss": 0.4561, "step": 480 }, { "epoch": 0.7498087222647284, "grad_norm": 0.1956746346911804, "learning_rate": 0.00018658768690597198, "loss": 0.4562, "step": 490 }, { "epoch": 0.7651109410864575, "grad_norm": 0.19485019146231133, "learning_rate": 0.00018568254953939573, "loss": 0.4547, "step": 500 }, { "epoch": 0.7804131599081867, 
"grad_norm": 0.1900932930656561, "learning_rate": 0.0001847502054639483, "loss": 0.4563, "step": 510 }, { "epoch": 0.7957153787299158, "grad_norm": 0.18608837489031835, "learning_rate": 0.00018379095072607052, "loss": 0.4546, "step": 520 }, { "epoch": 0.811017597551645, "grad_norm": 0.19229921508866923, "learning_rate": 0.0001828050899171234, "loss": 0.4506, "step": 530 }, { "epoch": 0.8263198163733741, "grad_norm": 0.20164999528455674, "learning_rate": 0.00018179293607667178, "loss": 0.4528, "step": 540 }, { "epoch": 0.8416220351951033, "grad_norm": 0.18962588415494633, "learning_rate": 0.00018075481059308488, "loss": 0.4501, "step": 550 }, { "epoch": 0.8569242540168325, "grad_norm": 0.1878318467581915, "learning_rate": 0.00017969104310148627, "loss": 0.4489, "step": 560 }, { "epoch": 0.8722264728385616, "grad_norm": 0.19855697462159105, "learning_rate": 0.00017860197137908504, "loss": 0.4486, "step": 570 }, { "epoch": 0.8875286916602907, "grad_norm": 0.18806947832467016, "learning_rate": 0.00017748794123792169, "loss": 0.449, "step": 580 }, { "epoch": 0.9028309104820199, "grad_norm": 0.18547154568221744, "learning_rate": 0.00017634930641506272, "loss": 0.4468, "step": 590 }, { "epoch": 0.918133129303749, "grad_norm": 0.1988415349574148, "learning_rate": 0.00017518642846027876, "loss": 0.4447, "step": 600 }, { "epoch": 0.9334353481254782, "grad_norm": 0.1971157063292002, "learning_rate": 0.00017399967662124204, "loss": 0.4496, "step": 610 }, { "epoch": 0.9487375669472073, "grad_norm": 0.18821213785618154, "learning_rate": 0.00017278942772627954, "loss": 0.4442, "step": 620 }, { "epoch": 0.9640397857689365, "grad_norm": 0.1863099123502169, "learning_rate": 0.00017155606606471873, "loss": 0.4428, "step": 630 }, { "epoch": 0.9793420045906657, "grad_norm": 0.1877422977434543, "learning_rate": 0.00017029998326486485, "loss": 0.4431, "step": 640 }, { "epoch": 0.9946442234123948, "grad_norm": 0.1976160670120698, "learning_rate": 0.00016902157816964724, "loss": 0.4382, "step": 650 }, { "epoch": 1.009946442234124, "grad_norm": 0.18992306538475748, "learning_rate": 0.0001677212567099752, "loss": 0.4353, "step": 660 }, { "epoch": 1.025248661055853, "grad_norm": 0.19209926834930846, "learning_rate": 0.00016639943177584302, "loss": 0.422, "step": 670 }, { "epoch": 1.0405508798775822, "grad_norm": 0.1862611696289539, "learning_rate": 0.00016505652308522546, "loss": 0.4166, "step": 680 }, { "epoch": 1.0558530986993113, "grad_norm": 0.19413670929825885, "learning_rate": 0.00016369295705080493, "loss": 0.4214, "step": 690 }, { "epoch": 1.0711553175210407, "grad_norm": 0.19044466403960753, "learning_rate": 0.00016230916664457303, "loss": 0.4226, "step": 700 }, { "epoch": 1.0864575363427698, "grad_norm": 0.20537259284486634, "learning_rate": 0.00016090559126034955, "loss": 0.4206, "step": 710 }, { "epoch": 1.1017597551644989, "grad_norm": 0.19548453583302455, "learning_rate": 0.00015948267657426172, "loss": 0.4258, "step": 720 }, { "epoch": 1.117061973986228, "grad_norm": 0.19233863084054012, "learning_rate": 0.00015804087440322937, "loss": 0.4212, "step": 730 }, { "epoch": 1.132364192807957, "grad_norm": 0.20059506523438514, "learning_rate": 0.00015658064256149972, "loss": 0.4248, "step": 740 }, { "epoch": 1.1476664116296864, "grad_norm": 0.19781190556296033, "learning_rate": 0.00015510244471527798, "loss": 0.419, "step": 750 }, { "epoch": 1.1629686304514155, "grad_norm": 0.1978522622616877, "learning_rate": 0.0001536067502355, "loss": 0.4156, "step": 760 }, { "epoch": 1.1782708492731446, "grad_norm": 
0.19356090367390197, "learning_rate": 0.00015209403404879303, "loss": 0.4176, "step": 770 }, { "epoch": 1.1935730680948737, "grad_norm": 0.21041098882140832, "learning_rate": 0.0001505647764866729, "loss": 0.4155, "step": 780 }, { "epoch": 1.2088752869166028, "grad_norm": 0.19309297495512096, "learning_rate": 0.00014901946313302452, "loss": 0.4125, "step": 790 }, { "epoch": 1.2241775057383322, "grad_norm": 0.20988205146033515, "learning_rate": 0.0001474585846699151, "loss": 0.4198, "step": 800 }, { "epoch": 1.2394797245600613, "grad_norm": 0.19899429014402967, "learning_rate": 0.00014588263672178812, "loss": 0.4139, "step": 810 }, { "epoch": 1.2547819433817904, "grad_norm": 0.19678065811835893, "learning_rate": 0.00014429211969808808, "loss": 0.4168, "step": 820 }, { "epoch": 1.2700841622035195, "grad_norm": 0.20776458272762527, "learning_rate": 0.00014268753863436602, "loss": 0.4137, "step": 830 }, { "epoch": 1.2853863810252486, "grad_norm": 0.2056945302468653, "learning_rate": 0.00014106940303191583, "loss": 0.4166, "step": 840 }, { "epoch": 1.300688599846978, "grad_norm": 0.20731916359275016, "learning_rate": 0.000139438226695993, "loss": 0.4179, "step": 850 }, { "epoch": 1.315990818668707, "grad_norm": 0.2001144508118885, "learning_rate": 0.00013779452757266617, "loss": 0.4131, "step": 860 }, { "epoch": 1.3312930374904361, "grad_norm": 0.19624161193905473, "learning_rate": 0.00013613882758435435, "loss": 0.4089, "step": 870 }, { "epoch": 1.3465952563121653, "grad_norm": 0.20129114730483852, "learning_rate": 0.0001344716524641012, "loss": 0.4149, "step": 880 }, { "epoch": 1.3618974751338944, "grad_norm": 0.207327826679874, "learning_rate": 0.0001327935315886395, "loss": 0.4097, "step": 890 }, { "epoch": 1.3771996939556237, "grad_norm": 0.2029298678382067, "learning_rate": 0.00013110499781029874, "loss": 0.4132, "step": 900 }, { "epoch": 1.3925019127773526, "grad_norm": 0.20764777212748992, "learning_rate": 0.00012940658728780862, "loss": 0.4142, "step": 910 }, { "epoch": 1.407804131599082, "grad_norm": 0.2017977764168179, "learning_rate": 0.00012769883931605333, "loss": 0.4212, "step": 920 }, { "epoch": 1.423106350420811, "grad_norm": 0.1932903343458798, "learning_rate": 0.00012598229615482954, "loss": 0.4127, "step": 930 }, { "epoch": 1.4384085692425401, "grad_norm": 0.20197440137591363, "learning_rate": 0.0001242575028566632, "loss": 0.4118, "step": 940 }, { "epoch": 1.4537107880642695, "grad_norm": 0.20030727904644482, "learning_rate": 0.00012252500709373934, "loss": 0.4133, "step": 950 }, { "epoch": 1.4690130068859983, "grad_norm": 0.20837354923941284, "learning_rate": 0.00012078535898400019, "loss": 0.4117, "step": 960 }, { "epoch": 1.4843152257077277, "grad_norm": 0.19925595673498087, "learning_rate": 0.00011903911091646684, "loss": 0.4083, "step": 970 }, { "epoch": 1.4996174445294568, "grad_norm": 0.2084484154721651, "learning_rate": 0.00011728681737583945, "loss": 0.408, "step": 980 }, { "epoch": 1.5149196633511859, "grad_norm": 0.20623569681522963, "learning_rate": 0.00011552903476643222, "loss": 0.4117, "step": 990 }, { "epoch": 1.5302218821729152, "grad_norm": 0.19912897780166905, "learning_rate": 0.0001137663212354988, "loss": 0.4071, "step": 1000 }, { "epoch": 1.545524100994644, "grad_norm": 0.2015032395684377, "learning_rate": 0.00011199923649600432, "loss": 0.4085, "step": 1010 }, { "epoch": 1.5608263198163734, "grad_norm": 0.19846674785205634, "learning_rate": 0.0001102283416489001, "loss": 0.4094, "step": 1020 }, { "epoch": 1.5761285386381025, "grad_norm": 
0.19503368149396527, "learning_rate": 0.00010845419900495772, "loss": 0.4031, "step": 1030 }, { "epoch": 1.5914307574598316, "grad_norm": 0.20636304448971973, "learning_rate": 0.00010667737190621911, "loss": 0.4141, "step": 1040 }, { "epoch": 1.606732976281561, "grad_norm": 0.19741819850815578, "learning_rate": 0.0001048984245471188, "loss": 0.4123, "step": 1050 }, { "epoch": 1.6220351951032899, "grad_norm": 0.208094236192064, "learning_rate": 0.00010311792179533589, "loss": 0.4094, "step": 1060 }, { "epoch": 1.6373374139250192, "grad_norm": 0.20834394157642375, "learning_rate": 0.00010133642901243199, "loss": 0.4073, "step": 1070 }, { "epoch": 1.6526396327467483, "grad_norm": 0.20325976490051528, "learning_rate": 9.955451187433249e-05, "loss": 0.4072, "step": 1080 }, { "epoch": 1.6679418515684774, "grad_norm": 0.19996940436744937, "learning_rate": 9.777273619170796e-05, "loss": 0.4067, "step": 1090 }, { "epoch": 1.6832440703902067, "grad_norm": 0.19609861659837347, "learning_rate": 9.599166773031269e-05, "loss": 0.405, "step": 1100 }, { "epoch": 1.6985462892119356, "grad_norm": 0.20661663114073753, "learning_rate": 9.421187203133763e-05, "loss": 0.4089, "step": 1110 }, { "epoch": 1.713848508033665, "grad_norm": 0.19921721692896047, "learning_rate": 9.243391423183448e-05, "loss": 0.4071, "step": 1120 }, { "epoch": 1.729150726855394, "grad_norm": 0.20297141939912075, "learning_rate": 9.06583588852683e-05, "loss": 0.4043, "step": 1130 }, { "epoch": 1.7444529456771232, "grad_norm": 0.20714391729089626, "learning_rate": 8.888576978225527e-05, "loss": 0.4031, "step": 1140 }, { "epoch": 1.7597551644988525, "grad_norm": 0.20284798828869055, "learning_rate": 8.711670977154274e-05, "loss": 0.4051, "step": 1150 }, { "epoch": 1.7750573833205814, "grad_norm": 0.20621816413716504, "learning_rate": 8.535174058128812e-05, "loss": 0.4028, "step": 1160 }, { "epoch": 1.7903596021423107, "grad_norm": 0.2107861816265634, "learning_rate": 8.359142264069424e-05, "loss": 0.3972, "step": 1170 }, { "epoch": 1.8056618209640398, "grad_norm": 0.20926246575879495, "learning_rate": 8.183631490205637e-05, "loss": 0.4033, "step": 1180 }, { "epoch": 1.820964039785769, "grad_norm": 0.20398570550310058, "learning_rate": 8.008697466327865e-05, "loss": 0.3994, "step": 1190 }, { "epoch": 1.836266258607498, "grad_norm": 0.21408610304309247, "learning_rate": 7.834395739091585e-05, "loss": 0.3971, "step": 1200 }, { "epoch": 1.8515684774292271, "grad_norm": 0.2059678534725396, "learning_rate": 7.660781654379638e-05, "loss": 0.4004, "step": 1210 }, { "epoch": 1.8668706962509565, "grad_norm": 0.20089108656511492, "learning_rate": 7.487910339728308e-05, "loss": 0.398, "step": 1220 }, { "epoch": 1.8821729150726856, "grad_norm": 0.2089057556225053, "learning_rate": 7.315836686822729e-05, "loss": 0.4, "step": 1230 }, { "epoch": 1.8974751338944147, "grad_norm": 0.2076979091587327, "learning_rate": 7.14461533406714e-05, "loss": 0.3942, "step": 1240 }, { "epoch": 1.9127773527161438, "grad_norm": 0.2075858197523019, "learning_rate": 6.974300649235633e-05, "loss": 0.4004, "step": 1250 }, { "epoch": 1.928079571537873, "grad_norm": 0.21210310960815237, "learning_rate": 6.804946712208793e-05, "loss": 0.4021, "step": 1260 }, { "epoch": 1.9433817903596022, "grad_norm": 0.21341887577251997, "learning_rate": 6.63660729780174e-05, "loss": 0.3954, "step": 1270 }, { "epoch": 1.9586840091813313, "grad_norm": 0.22172559452313256, "learning_rate": 6.469335858689074e-05, "loss": 0.4002, "step": 1280 }, { "epoch": 1.9739862280030605, "grad_norm": 
0.20709389048082577, "learning_rate": 6.303185508432085e-05, "loss": 0.4018, "step": 1290 }, { "epoch": 1.9892884468247896, "grad_norm": 0.21370736036441443, "learning_rate": 6.138209004613647e-05, "loss": 0.3955, "step": 1300 }, { "epoch": 2.0045906656465187, "grad_norm": 0.20094204383719153, "learning_rate": 5.974458732086149e-05, "loss": 0.3851, "step": 1310 }, { "epoch": 2.019892884468248, "grad_norm": 0.22289055701062488, "learning_rate": 5.81198668633778e-05, "loss": 0.3674, "step": 1320 }, { "epoch": 2.035195103289977, "grad_norm": 0.22724332895866212, "learning_rate": 5.6508444569824315e-05, "loss": 0.3614, "step": 1330 }, { "epoch": 2.050497322111706, "grad_norm": 0.23888425948419684, "learning_rate": 5.491083211378505e-05, "loss": 0.3614, "step": 1340 }, { "epoch": 2.0657995409334355, "grad_norm": 0.23068081709857463, "learning_rate": 5.3327536783817766e-05, "loss": 0.3644, "step": 1350 }, { "epoch": 2.0811017597551644, "grad_norm": 0.2460582871625401, "learning_rate": 5.1759061322375045e-05, "loss": 0.3634, "step": 1360 }, { "epoch": 2.0964039785768938, "grad_norm": 0.24787239171921766, "learning_rate": 5.0205903766168915e-05, "loss": 0.3612, "step": 1370 }, { "epoch": 2.1117061973986226, "grad_norm": 0.23563510103472043, "learning_rate": 4.8668557288029684e-05, "loss": 0.3627, "step": 1380 }, { "epoch": 2.127008416220352, "grad_norm": 0.23966764070729765, "learning_rate": 4.7147510040309115e-05, "loss": 0.3633, "step": 1390 }, { "epoch": 2.1423106350420813, "grad_norm": 0.24153476110732292, "learning_rate": 4.56432449998779e-05, "loss": 0.3621, "step": 1400 }, { "epoch": 2.15761285386381, "grad_norm": 0.23966649969853318, "learning_rate": 4.4156239814766355e-05, "loss": 0.3597, "step": 1410 }, { "epoch": 2.1729150726855395, "grad_norm": 0.2492626290666687, "learning_rate": 4.268696665249724e-05, "loss": 0.3663, "step": 1420 }, { "epoch": 2.1882172915072684, "grad_norm": 0.24762132725809086, "learning_rate": 4.1235892050158866e-05, "loss": 0.3604, "step": 1430 }, { "epoch": 2.2035195103289977, "grad_norm": 0.24689838661058983, "learning_rate": 3.9803476766265835e-05, "loss": 0.3648, "step": 1440 }, { "epoch": 2.218821729150727, "grad_norm": 0.24705256051578678, "learning_rate": 3.839017563445489e-05, "loss": 0.3625, "step": 1450 }, { "epoch": 2.234123947972456, "grad_norm": 0.2496443256139868, "learning_rate": 3.699643741906193e-05, "loss": 0.3545, "step": 1460 }, { "epoch": 2.2494261667941853, "grad_norm": 0.249005841241916, "learning_rate": 3.562270467262619e-05, "loss": 0.3601, "step": 1470 }, { "epoch": 2.264728385615914, "grad_norm": 0.24757649232051404, "learning_rate": 3.426941359536699e-05, "loss": 0.3576, "step": 1480 }, { "epoch": 2.2800306044376435, "grad_norm": 0.25451870114574876, "learning_rate": 3.293699389667734e-05, "loss": 0.3648, "step": 1490 }, { "epoch": 2.295332823259373, "grad_norm": 0.2453131567916715, "learning_rate": 3.1625868658678784e-05, "loss": 0.3625, "step": 1500 }, { "epoch": 2.3106350420811017, "grad_norm": 0.26123253732278273, "learning_rate": 3.0336454201880404e-05, "loss": 0.3557, "step": 1510 }, { "epoch": 2.325937260902831, "grad_norm": 0.2626580334343511, "learning_rate": 2.9069159952984938e-05, "loss": 0.3566, "step": 1520 }, { "epoch": 2.34123947972456, "grad_norm": 0.25003501828393526, "learning_rate": 2.7824388314883876e-05, "loss": 0.3591, "step": 1530 }, { "epoch": 2.3565416985462893, "grad_norm": 0.2507207739640118, "learning_rate": 2.6602534538882752e-05, "loss": 0.358, "step": 1540 }, { "epoch": 2.371843917368018, "grad_norm": 
0.2512257287691549, "learning_rate": 2.5403986599197403e-05, "loss": 0.3501, "step": 1550 }, { "epoch": 2.3871461361897475, "grad_norm": 0.2499142391014787, "learning_rate": 2.4229125069760773e-05, "loss": 0.3517, "step": 1560 }, { "epoch": 2.402448355011477, "grad_norm": 0.24884412260444377, "learning_rate": 2.30783230033796e-05, "loss": 0.3568, "step": 1570 }, { "epoch": 2.4177505738332057, "grad_norm": 0.25335712662759613, "learning_rate": 2.1951945813279306e-05, "loss": 0.3547, "step": 1580 }, { "epoch": 2.433052792654935, "grad_norm": 0.24937746984746909, "learning_rate": 2.0850351157074598e-05, "loss": 0.353, "step": 1590 }, { "epoch": 2.4483550114766643, "grad_norm": 0.26237405083230225, "learning_rate": 1.9773888823202747e-05, "loss": 0.3579, "step": 1600 }, { "epoch": 2.4636572302983932, "grad_norm": 0.24937407019841143, "learning_rate": 1.8722900619855577e-05, "loss": 0.3562, "step": 1610 }, { "epoch": 2.4789594491201226, "grad_norm": 0.2497739493217595, "learning_rate": 1.7697720266445374e-05, "loss": 0.3512, "step": 1620 }, { "epoch": 2.4942616679418514, "grad_norm": 0.250399923768968, "learning_rate": 1.6698673287639242e-05, "loss": 0.3556, "step": 1630 }, { "epoch": 2.5095638867635808, "grad_norm": 0.2538234603963528, "learning_rate": 1.5726076909995525e-05, "loss": 0.355, "step": 1640 }, { "epoch": 2.5248661055853097, "grad_norm": 0.2504434686740776, "learning_rate": 1.4780239961235143e-05, "loss": 0.3581, "step": 1650 }, { "epoch": 2.540168324407039, "grad_norm": 0.2537006354023553, "learning_rate": 1.3861462772179735e-05, "loss": 0.3529, "step": 1660 }, { "epoch": 2.5554705432287683, "grad_norm": 0.2560925692389684, "learning_rate": 1.297003708138792e-05, "loss": 0.3584, "step": 1670 }, { "epoch": 2.570772762050497, "grad_norm": 0.25340005468102866, "learning_rate": 1.2106245942519745e-05, "loss": 0.3562, "step": 1680 }, { "epoch": 2.5860749808722265, "grad_norm": 0.25785387176401525, "learning_rate": 1.1270363634458903e-05, "loss": 0.3541, "step": 1690 }, { "epoch": 2.601377199693956, "grad_norm": 0.2615280811952495, "learning_rate": 1.0462655574221213e-05, "loss": 0.3553, "step": 1700 }, { "epoch": 2.6166794185156848, "grad_norm": 0.24663819526398556, "learning_rate": 9.683378232676965e-06, "loss": 0.3532, "step": 1710 }, { "epoch": 2.631981637337414, "grad_norm": 0.2614751928374674, "learning_rate": 8.932779053113893e-06, "loss": 0.3546, "step": 1720 }, { "epoch": 2.647283856159143, "grad_norm": 0.25884670404819016, "learning_rate": 8.211096372666783e-06, "loss": 0.3559, "step": 1730 }, { "epoch": 2.6625860749808723, "grad_norm": 0.26303763365287397, "learning_rate": 7.518559346638432e-06, "loss": 0.3556, "step": 1740 }, { "epoch": 2.677888293802601, "grad_norm": 0.2591833975916801, "learning_rate": 6.855387875736152e-06, "loss": 0.3577, "step": 1750 }, { "epoch": 2.6931905126243305, "grad_norm": 0.24688502674320847, "learning_rate": 6.221792536246973e-06, "loss": 0.3557, "step": 1760 }, { "epoch": 2.70849273144606, "grad_norm": 0.26100100849151864, "learning_rate": 5.617974513173341e-06, "loss": 0.3548, "step": 1770 }, { "epoch": 2.7237949502677887, "grad_norm": 0.2699346579349574, "learning_rate": 5.044125536351196e-06, "loss": 0.3511, "step": 1780 }, { "epoch": 2.739097169089518, "grad_norm": 0.2717440390716044, "learning_rate": 4.500427819570097e-06, "loss": 0.3494, "step": 1790 }, { "epoch": 2.7543993879112474, "grad_norm": 0.2579283254483208, "learning_rate": 3.987054002714952e-06, "loss": 0.3569, "step": 1800 }, { "epoch": 2.7697016067329763, "grad_norm": 
0.2594723269046269, "learning_rate": 3.504167096947952e-06, "loss": 0.3516, "step": 1810 }, { "epoch": 2.785003825554705, "grad_norm": 0.256455133609715, "learning_rate": 3.051920432947664e-06, "loss": 0.3512, "step": 1820 }, { "epoch": 2.8003060443764345, "grad_norm": 0.2621246657840175, "learning_rate": 2.6304576122221035e-06, "loss": 0.3527, "step": 1830 }, { "epoch": 2.815608263198164, "grad_norm": 0.26278995363196544, "learning_rate": 2.2399124615110846e-06, "loss": 0.3518, "step": 1840 }, { "epoch": 2.8309104820198927, "grad_norm": 0.27091602282401345, "learning_rate": 1.880408990292315e-06, "loss": 0.3581, "step": 1850 }, { "epoch": 2.846212700841622, "grad_norm": 0.25572786473847814, "learning_rate": 1.5520613514047655e-06, "loss": 0.3562, "step": 1860 }, { "epoch": 2.8615149196633514, "grad_norm": 0.26244627787054337, "learning_rate": 1.2549738048017846e-06, "loss": 0.3552, "step": 1870 }, { "epoch": 2.8768171384850802, "grad_norm": 0.26131103676147655, "learning_rate": 9.892406844456026e-07, "loss": 0.352, "step": 1880 }, { "epoch": 2.8921193573068096, "grad_norm": 0.27449669022834305, "learning_rate": 7.549463683534374e-07, "loss": 0.3504, "step": 1890 }, { "epoch": 2.907421576128539, "grad_norm": 0.264605800831863, "learning_rate": 5.521652518051368e-07, "loss": 0.3515, "step": 1900 }, { "epoch": 2.922723794950268, "grad_norm": 0.2618398016329481, "learning_rate": 3.809617237203744e-07, "loss": 0.3561, "step": 1910 }, { "epoch": 2.9380260137719967, "grad_norm": 0.26831015620903936, "learning_rate": 2.4139014621340494e-07, "loss": 0.3527, "step": 1920 }, { "epoch": 2.953328232593726, "grad_norm": 0.26666302239559175, "learning_rate": 1.334948373314493e-07, "loss": 0.3562, "step": 1930 }, { "epoch": 2.9686304514154553, "grad_norm": 0.262359494195635, "learning_rate": 5.7310056982418094e-08, "loss": 0.355, "step": 1940 }, { "epoch": 2.9839326702371842, "grad_norm": 0.2576422839291312, "learning_rate": 1.2859996056402423e-08, "loss": 0.3531, "step": 1950 }, { "epoch": 2.9977046671767407, "step": 1959, "total_flos": 3660773289099264.0, "train_loss": 0.4337311900958168, "train_runtime": 5331.8775, "train_samples_per_second": 5.882, "train_steps_per_second": 0.367 } ], "logging_steps": 10, "max_steps": 1959, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3660773289099264.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }
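The object above follows the layout of a HuggingFace `transformers` trainer state file: `log_history` holds one entry per logging step (here every 10 steps, per `logging_steps`) with `epoch`, `step`, `loss`, `learning_rate`, and `grad_norm`, plus a final summary entry carrying `train_loss`, `train_runtime`, and throughput figures. Below is a minimal sketch, not part of the original file, showing one way to read this state back in and inspect the logged loss and learning-rate schedule; the file name `trainer_state.json` and the output path `loss_curve.png` are assumptions for illustration.

# Minimal sketch (assumes the JSON above is saved as "trainer_state.json").
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step entries that carry a "loss" value; the final
# summary entry reports "train_loss"/"train_runtime" instead.
logs = [e for e in state["log_history"] if "loss" in e]

steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
lrs = [e["learning_rate"] for e in logs]

print(f"logged points: {len(logs)}, final logged loss: {losses[-1]:.4f}")

# Plot loss and learning rate against the global step.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
ax1.plot(steps, losses)
ax1.set_ylabel("training loss")
ax2.plot(steps, lrs)
ax2.set_ylabel("learning rate")
ax2.set_xlabel("global step")
fig.tight_layout()
fig.savefig("loss_curve.png")

Run against this state, the script should show the warmup to the peak learning rate of about 2e-4 around step 200 followed by a smooth decay toward zero at step 1959, with the logged loss falling from roughly 1.70 to about 0.35 over the three epochs.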