|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9977046671767407, |
|
"eval_steps": 500, |
|
"global_step": 1959, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.015302218821729151, |
|
"grad_norm": 2.9595158525528062, |
|
"learning_rate": 1.0204081632653061e-05, |
|
"loss": 1.7025, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.030604437643458302, |
|
"grad_norm": 1.263751808272248, |
|
"learning_rate": 2.0408163265306123e-05, |
|
"loss": 1.4094, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.045906656465187455, |
|
"grad_norm": 1.1314499693827325, |
|
"learning_rate": 3.061224489795919e-05, |
|
"loss": 1.0742, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.061208875286916604, |
|
"grad_norm": 0.43429385320294256, |
|
"learning_rate": 4.0816326530612245e-05, |
|
"loss": 0.8975, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07651109410864575, |
|
"grad_norm": 0.37910893323587813, |
|
"learning_rate": 5.102040816326531e-05, |
|
"loss": 0.7694, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.09181331293037491, |
|
"grad_norm": 0.32985782659377816, |
|
"learning_rate": 6.122448979591838e-05, |
|
"loss": 0.6667, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.10711553175210406, |
|
"grad_norm": 0.27993689719153514, |
|
"learning_rate": 7.142857142857143e-05, |
|
"loss": 0.6143, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12241775057383321, |
|
"grad_norm": 0.2562350918748776, |
|
"learning_rate": 8.163265306122449e-05, |
|
"loss": 0.5736, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.13771996939556236, |
|
"grad_norm": 0.2627549443187762, |
|
"learning_rate": 9.183673469387756e-05, |
|
"loss": 0.5605, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1530221882172915, |
|
"grad_norm": 0.25450163445897056, |
|
"learning_rate": 0.00010204081632653062, |
|
"loss": 0.5425, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16832440703902066, |
|
"grad_norm": 0.265185130228763, |
|
"learning_rate": 0.00011224489795918367, |
|
"loss": 0.5417, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.18362662586074982, |
|
"grad_norm": 0.2600738826510011, |
|
"learning_rate": 0.00012244897959183676, |
|
"loss": 0.5349, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.19892884468247896, |
|
"grad_norm": 0.27451283782332153, |
|
"learning_rate": 0.0001326530612244898, |
|
"loss": 0.5225, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.21423106350420812, |
|
"grad_norm": 0.29276216948080613, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 0.5148, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.22953328232593725, |
|
"grad_norm": 0.25373117999248507, |
|
"learning_rate": 0.0001530612244897959, |
|
"loss": 0.5108, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.24483550114766642, |
|
"grad_norm": 0.26479254050664824, |
|
"learning_rate": 0.00016326530612244898, |
|
"loss": 0.5061, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.26013771996939555, |
|
"grad_norm": 0.27232811822105624, |
|
"learning_rate": 0.00017346938775510205, |
|
"loss": 0.5065, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2754399387911247, |
|
"grad_norm": 0.26418337252250673, |
|
"learning_rate": 0.00018367346938775512, |
|
"loss": 0.4967, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2907421576128539, |
|
"grad_norm": 0.2503450300406005, |
|
"learning_rate": 0.00019387755102040816, |
|
"loss": 0.4998, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.306044376434583, |
|
"grad_norm": 0.2339540170507543, |
|
"learning_rate": 0.0001999974597101728, |
|
"loss": 0.4989, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.32134659525631215, |
|
"grad_norm": 0.23446730330407825, |
|
"learning_rate": 0.0001999688829317862, |
|
"loss": 0.4895, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3366488140780413, |
|
"grad_norm": 0.23685036500213205, |
|
"learning_rate": 0.00019990856311693857, |
|
"loss": 0.4898, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3519510328997705, |
|
"grad_norm": 0.2396529148643123, |
|
"learning_rate": 0.00019981651941893068, |
|
"loss": 0.4881, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.36725325172149964, |
|
"grad_norm": 0.22794906865901965, |
|
"learning_rate": 0.0001996927810643216, |
|
"loss": 0.4825, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.38255547054322875, |
|
"grad_norm": 0.21749849325438408, |
|
"learning_rate": 0.00019953738734364843, |
|
"loss": 0.4905, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.3978576893649579, |
|
"grad_norm": 0.21941306857106413, |
|
"learning_rate": 0.00019935038759895038, |
|
"loss": 0.4844, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.4131599081866871, |
|
"grad_norm": 0.2106883472247568, |
|
"learning_rate": 0.0001991318412081012, |
|
"loss": 0.4832, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.42846212700841624, |
|
"grad_norm": 0.20643151525949935, |
|
"learning_rate": 0.00019888181756595513, |
|
"loss": 0.4732, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.44376434583014535, |
|
"grad_norm": 0.21375448358620805, |
|
"learning_rate": 0.0001986003960623118, |
|
"loss": 0.4777, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4590665646518745, |
|
"grad_norm": 0.2078386748706879, |
|
"learning_rate": 0.0001982876660567078, |
|
"loss": 0.4773, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4743687834736037, |
|
"grad_norm": 0.20709767688640485, |
|
"learning_rate": 0.00019794372685004232, |
|
"loss": 0.4774, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.48967100229533284, |
|
"grad_norm": 0.20114204358521104, |
|
"learning_rate": 0.00019756868765304637, |
|
"loss": 0.4724, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.504973221117062, |
|
"grad_norm": 0.2047995080216857, |
|
"learning_rate": 0.000197162667551605, |
|
"loss": 0.4723, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5202754399387911, |
|
"grad_norm": 0.2065404899680565, |
|
"learning_rate": 0.00019672579546894418, |
|
"loss": 0.4744, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5355776587605203, |
|
"grad_norm": 0.20339328597837086, |
|
"learning_rate": 0.00019625821012469392, |
|
"loss": 0.469, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5508798775822494, |
|
"grad_norm": 0.20269051264889382, |
|
"learning_rate": 0.0001957600599908406, |
|
"loss": 0.4652, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5661820964039785, |
|
"grad_norm": 0.20140204074578216, |
|
"learning_rate": 0.00019523150324458297, |
|
"loss": 0.4663, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5814843152257078, |
|
"grad_norm": 0.20145479660046747, |
|
"learning_rate": 0.0001946727077181062, |
|
"loss": 0.4647, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5967865340474369, |
|
"grad_norm": 0.19812410832421173, |
|
"learning_rate": 0.00019408385084529014, |
|
"loss": 0.4669, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.612088752869166, |
|
"grad_norm": 0.20593747645685082, |
|
"learning_rate": 0.0001934651196053692, |
|
"loss": 0.4606, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6273909716908952, |
|
"grad_norm": 0.19985586154896637, |
|
"learning_rate": 0.00019281671046356057, |
|
"loss": 0.465, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6426931905126243, |
|
"grad_norm": 0.18804149710765375, |
|
"learning_rate": 0.0001921388293086812, |
|
"loss": 0.4591, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6579954093343535, |
|
"grad_norm": 0.20403982281128524, |
|
"learning_rate": 0.00019143169138777176, |
|
"loss": 0.4612, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6732976281560826, |
|
"grad_norm": 0.19472172680549107, |
|
"learning_rate": 0.00019069552123774966, |
|
"loss": 0.4535, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6885998469778117, |
|
"grad_norm": 0.18853093267297413, |
|
"learning_rate": 0.00018993055261411188, |
|
"loss": 0.4536, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.703902065799541, |
|
"grad_norm": 0.1824795154817547, |
|
"learning_rate": 0.0001891370284167108, |
|
"loss": 0.4533, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7192042846212701, |
|
"grad_norm": 0.19623573820167492, |
|
"learning_rate": 0.00018831520061262657, |
|
"loss": 0.4608, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7345065034429993, |
|
"grad_norm": 0.1894732171051352, |
|
"learning_rate": 0.00018746533015615997, |
|
"loss": 0.4561, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7498087222647284, |
|
"grad_norm": 0.1956746346911804, |
|
"learning_rate": 0.00018658768690597198, |
|
"loss": 0.4562, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7651109410864575, |
|
"grad_norm": 0.19485019146231133, |
|
"learning_rate": 0.00018568254953939573, |
|
"loss": 0.4547, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7804131599081867, |
|
"grad_norm": 0.1900932930656561, |
|
"learning_rate": 0.0001847502054639483, |
|
"loss": 0.4563, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.7957153787299158, |
|
"grad_norm": 0.18608837489031835, |
|
"learning_rate": 0.00018379095072607052, |
|
"loss": 0.4546, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.811017597551645, |
|
"grad_norm": 0.19229921508866923, |
|
"learning_rate": 0.0001828050899171234, |
|
"loss": 0.4506, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8263198163733741, |
|
"grad_norm": 0.20164999528455674, |
|
"learning_rate": 0.00018179293607667178, |
|
"loss": 0.4528, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8416220351951033, |
|
"grad_norm": 0.18962588415494633, |
|
"learning_rate": 0.00018075481059308488, |
|
"loss": 0.4501, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8569242540168325, |
|
"grad_norm": 0.1878318467581915, |
|
"learning_rate": 0.00017969104310148627, |
|
"loss": 0.4489, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.8722264728385616, |
|
"grad_norm": 0.19855697462159105, |
|
"learning_rate": 0.00017860197137908504, |
|
"loss": 0.4486, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.8875286916602907, |
|
"grad_norm": 0.18806947832467016, |
|
"learning_rate": 0.00017748794123792169, |
|
"loss": 0.449, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.9028309104820199, |
|
"grad_norm": 0.18547154568221744, |
|
"learning_rate": 0.00017634930641506272, |
|
"loss": 0.4468, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.918133129303749, |
|
"grad_norm": 0.1988415349574148, |
|
"learning_rate": 0.00017518642846027876, |
|
"loss": 0.4447, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9334353481254782, |
|
"grad_norm": 0.1971157063292002, |
|
"learning_rate": 0.00017399967662124204, |
|
"loss": 0.4496, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9487375669472073, |
|
"grad_norm": 0.18821213785618154, |
|
"learning_rate": 0.00017278942772627954, |
|
"loss": 0.4442, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9640397857689365, |
|
"grad_norm": 0.1863099123502169, |
|
"learning_rate": 0.00017155606606471873, |
|
"loss": 0.4428, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9793420045906657, |
|
"grad_norm": 0.1877422977434543, |
|
"learning_rate": 0.00017029998326486485, |
|
"loss": 0.4431, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9946442234123948, |
|
"grad_norm": 0.1976160670120698, |
|
"learning_rate": 0.00016902157816964724, |
|
"loss": 0.4382, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.009946442234124, |
|
"grad_norm": 0.18992306538475748, |
|
"learning_rate": 0.0001677212567099752, |
|
"loss": 0.4353, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.025248661055853, |
|
"grad_norm": 0.19209926834930846, |
|
"learning_rate": 0.00016639943177584302, |
|
"loss": 0.422, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0405508798775822, |
|
"grad_norm": 0.1862611696289539, |
|
"learning_rate": 0.00016505652308522546, |
|
"loss": 0.4166, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.0558530986993113, |
|
"grad_norm": 0.19413670929825885, |
|
"learning_rate": 0.00016369295705080493, |
|
"loss": 0.4214, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.0711553175210407, |
|
"grad_norm": 0.19044466403960753, |
|
"learning_rate": 0.00016230916664457303, |
|
"loss": 0.4226, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.0864575363427698, |
|
"grad_norm": 0.20537259284486634, |
|
"learning_rate": 0.00016090559126034955, |
|
"loss": 0.4206, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.1017597551644989, |
|
"grad_norm": 0.19548453583302455, |
|
"learning_rate": 0.00015948267657426172, |
|
"loss": 0.4258, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.117061973986228, |
|
"grad_norm": 0.19233863084054012, |
|
"learning_rate": 0.00015804087440322937, |
|
"loss": 0.4212, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.132364192807957, |
|
"grad_norm": 0.20059506523438514, |
|
"learning_rate": 0.00015658064256149972, |
|
"loss": 0.4248, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.1476664116296864, |
|
"grad_norm": 0.19781190556296033, |
|
"learning_rate": 0.00015510244471527798, |
|
"loss": 0.419, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.1629686304514155, |
|
"grad_norm": 0.1978522622616877, |
|
"learning_rate": 0.0001536067502355, |
|
"loss": 0.4156, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.1782708492731446, |
|
"grad_norm": 0.19356090367390197, |
|
"learning_rate": 0.00015209403404879303, |
|
"loss": 0.4176, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.1935730680948737, |
|
"grad_norm": 0.21041098882140832, |
|
"learning_rate": 0.0001505647764866729, |
|
"loss": 0.4155, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.2088752869166028, |
|
"grad_norm": 0.19309297495512096, |
|
"learning_rate": 0.00014901946313302452, |
|
"loss": 0.4125, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.2241775057383322, |
|
"grad_norm": 0.20988205146033515, |
|
"learning_rate": 0.0001474585846699151, |
|
"loss": 0.4198, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2394797245600613, |
|
"grad_norm": 0.19899429014402967, |
|
"learning_rate": 0.00014588263672178812, |
|
"loss": 0.4139, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.2547819433817904, |
|
"grad_norm": 0.19678065811835893, |
|
"learning_rate": 0.00014429211969808808, |
|
"loss": 0.4168, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.2700841622035195, |
|
"grad_norm": 0.20776458272762527, |
|
"learning_rate": 0.00014268753863436602, |
|
"loss": 0.4137, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.2853863810252486, |
|
"grad_norm": 0.2056945302468653, |
|
"learning_rate": 0.00014106940303191583, |
|
"loss": 0.4166, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.300688599846978, |
|
"grad_norm": 0.20731916359275016, |
|
"learning_rate": 0.000139438226695993, |
|
"loss": 0.4179, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.315990818668707, |
|
"grad_norm": 0.2001144508118885, |
|
"learning_rate": 0.00013779452757266617, |
|
"loss": 0.4131, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.3312930374904361, |
|
"grad_norm": 0.19624161193905473, |
|
"learning_rate": 0.00013613882758435435, |
|
"loss": 0.4089, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.3465952563121653, |
|
"grad_norm": 0.20129114730483852, |
|
"learning_rate": 0.0001344716524641012, |
|
"loss": 0.4149, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.3618974751338944, |
|
"grad_norm": 0.207327826679874, |
|
"learning_rate": 0.0001327935315886395, |
|
"loss": 0.4097, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.3771996939556237, |
|
"grad_norm": 0.2029298678382067, |
|
"learning_rate": 0.00013110499781029874, |
|
"loss": 0.4132, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.3925019127773526, |
|
"grad_norm": 0.20764777212748992, |
|
"learning_rate": 0.00012940658728780862, |
|
"loss": 0.4142, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.407804131599082, |
|
"grad_norm": 0.2017977764168179, |
|
"learning_rate": 0.00012769883931605333, |
|
"loss": 0.4212, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.423106350420811, |
|
"grad_norm": 0.1932903343458798, |
|
"learning_rate": 0.00012598229615482954, |
|
"loss": 0.4127, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.4384085692425401, |
|
"grad_norm": 0.20197440137591363, |
|
"learning_rate": 0.0001242575028566632, |
|
"loss": 0.4118, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.4537107880642695, |
|
"grad_norm": 0.20030727904644482, |
|
"learning_rate": 0.00012252500709373934, |
|
"loss": 0.4133, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.4690130068859983, |
|
"grad_norm": 0.20837354923941284, |
|
"learning_rate": 0.00012078535898400019, |
|
"loss": 0.4117, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.4843152257077277, |
|
"grad_norm": 0.19925595673498087, |
|
"learning_rate": 0.00011903911091646684, |
|
"loss": 0.4083, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.4996174445294568, |
|
"grad_norm": 0.2084484154721651, |
|
"learning_rate": 0.00011728681737583945, |
|
"loss": 0.408, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.5149196633511859, |
|
"grad_norm": 0.20623569681522963, |
|
"learning_rate": 0.00011552903476643222, |
|
"loss": 0.4117, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.5302218821729152, |
|
"grad_norm": 0.19912897780166905, |
|
"learning_rate": 0.0001137663212354988, |
|
"loss": 0.4071, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.545524100994644, |
|
"grad_norm": 0.2015032395684377, |
|
"learning_rate": 0.00011199923649600432, |
|
"loss": 0.4085, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.5608263198163734, |
|
"grad_norm": 0.19846674785205634, |
|
"learning_rate": 0.0001102283416489001, |
|
"loss": 0.4094, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.5761285386381025, |
|
"grad_norm": 0.19503368149396527, |
|
"learning_rate": 0.00010845419900495772, |
|
"loss": 0.4031, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.5914307574598316, |
|
"grad_norm": 0.20636304448971973, |
|
"learning_rate": 0.00010667737190621911, |
|
"loss": 0.4141, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.606732976281561, |
|
"grad_norm": 0.19741819850815578, |
|
"learning_rate": 0.0001048984245471188, |
|
"loss": 0.4123, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.6220351951032899, |
|
"grad_norm": 0.208094236192064, |
|
"learning_rate": 0.00010311792179533589, |
|
"loss": 0.4094, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.6373374139250192, |
|
"grad_norm": 0.20834394157642375, |
|
"learning_rate": 0.00010133642901243199, |
|
"loss": 0.4073, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.6526396327467483, |
|
"grad_norm": 0.20325976490051528, |
|
"learning_rate": 9.955451187433249e-05, |
|
"loss": 0.4072, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.6679418515684774, |
|
"grad_norm": 0.19996940436744937, |
|
"learning_rate": 9.777273619170796e-05, |
|
"loss": 0.4067, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.6832440703902067, |
|
"grad_norm": 0.19609861659837347, |
|
"learning_rate": 9.599166773031269e-05, |
|
"loss": 0.405, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.6985462892119356, |
|
"grad_norm": 0.20661663114073753, |
|
"learning_rate": 9.421187203133763e-05, |
|
"loss": 0.4089, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.713848508033665, |
|
"grad_norm": 0.19921721692896047, |
|
"learning_rate": 9.243391423183448e-05, |
|
"loss": 0.4071, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.729150726855394, |
|
"grad_norm": 0.20297141939912075, |
|
"learning_rate": 9.06583588852683e-05, |
|
"loss": 0.4043, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.7444529456771232, |
|
"grad_norm": 0.20714391729089626, |
|
"learning_rate": 8.888576978225527e-05, |
|
"loss": 0.4031, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.7597551644988525, |
|
"grad_norm": 0.20284798828869055, |
|
"learning_rate": 8.711670977154274e-05, |
|
"loss": 0.4051, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.7750573833205814, |
|
"grad_norm": 0.20621816413716504, |
|
"learning_rate": 8.535174058128812e-05, |
|
"loss": 0.4028, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.7903596021423107, |
|
"grad_norm": 0.2107861816265634, |
|
"learning_rate": 8.359142264069424e-05, |
|
"loss": 0.3972, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.8056618209640398, |
|
"grad_norm": 0.20926246575879495, |
|
"learning_rate": 8.183631490205637e-05, |
|
"loss": 0.4033, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.820964039785769, |
|
"grad_norm": 0.20398570550310058, |
|
"learning_rate": 8.008697466327865e-05, |
|
"loss": 0.3994, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.836266258607498, |
|
"grad_norm": 0.21408610304309247, |
|
"learning_rate": 7.834395739091585e-05, |
|
"loss": 0.3971, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.8515684774292271, |
|
"grad_norm": 0.2059678534725396, |
|
"learning_rate": 7.660781654379638e-05, |
|
"loss": 0.4004, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.8668706962509565, |
|
"grad_norm": 0.20089108656511492, |
|
"learning_rate": 7.487910339728308e-05, |
|
"loss": 0.398, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.8821729150726856, |
|
"grad_norm": 0.2089057556225053, |
|
"learning_rate": 7.315836686822729e-05, |
|
"loss": 0.4, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.8974751338944147, |
|
"grad_norm": 0.2076979091587327, |
|
"learning_rate": 7.14461533406714e-05, |
|
"loss": 0.3942, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.9127773527161438, |
|
"grad_norm": 0.2075858197523019, |
|
"learning_rate": 6.974300649235633e-05, |
|
"loss": 0.4004, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.928079571537873, |
|
"grad_norm": 0.21210310960815237, |
|
"learning_rate": 6.804946712208793e-05, |
|
"loss": 0.4021, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.9433817903596022, |
|
"grad_norm": 0.21341887577251997, |
|
"learning_rate": 6.63660729780174e-05, |
|
"loss": 0.3954, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.9586840091813313, |
|
"grad_norm": 0.22172559452313256, |
|
"learning_rate": 6.469335858689074e-05, |
|
"loss": 0.4002, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.9739862280030605, |
|
"grad_norm": 0.20709389048082577, |
|
"learning_rate": 6.303185508432085e-05, |
|
"loss": 0.4018, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.9892884468247896, |
|
"grad_norm": 0.21370736036441443, |
|
"learning_rate": 6.138209004613647e-05, |
|
"loss": 0.3955, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.0045906656465187, |
|
"grad_norm": 0.20094204383719153, |
|
"learning_rate": 5.974458732086149e-05, |
|
"loss": 0.3851, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.019892884468248, |
|
"grad_norm": 0.22289055701062488, |
|
"learning_rate": 5.81198668633778e-05, |
|
"loss": 0.3674, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.035195103289977, |
|
"grad_norm": 0.22724332895866212, |
|
"learning_rate": 5.6508444569824315e-05, |
|
"loss": 0.3614, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.050497322111706, |
|
"grad_norm": 0.23888425948419684, |
|
"learning_rate": 5.491083211378505e-05, |
|
"loss": 0.3614, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.0657995409334355, |
|
"grad_norm": 0.23068081709857463, |
|
"learning_rate": 5.3327536783817766e-05, |
|
"loss": 0.3644, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.0811017597551644, |
|
"grad_norm": 0.2460582871625401, |
|
"learning_rate": 5.1759061322375045e-05, |
|
"loss": 0.3634, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.0964039785768938, |
|
"grad_norm": 0.24787239171921766, |
|
"learning_rate": 5.0205903766168915e-05, |
|
"loss": 0.3612, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.1117061973986226, |
|
"grad_norm": 0.23563510103472043, |
|
"learning_rate": 4.8668557288029684e-05, |
|
"loss": 0.3627, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.127008416220352, |
|
"grad_norm": 0.23966764070729765, |
|
"learning_rate": 4.7147510040309115e-05, |
|
"loss": 0.3633, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.1423106350420813, |
|
"grad_norm": 0.24153476110732292, |
|
"learning_rate": 4.56432449998779e-05, |
|
"loss": 0.3621, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.15761285386381, |
|
"grad_norm": 0.23966649969853318, |
|
"learning_rate": 4.4156239814766355e-05, |
|
"loss": 0.3597, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.1729150726855395, |
|
"grad_norm": 0.2492626290666687, |
|
"learning_rate": 4.268696665249724e-05, |
|
"loss": 0.3663, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.1882172915072684, |
|
"grad_norm": 0.24762132725809086, |
|
"learning_rate": 4.1235892050158866e-05, |
|
"loss": 0.3604, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.2035195103289977, |
|
"grad_norm": 0.24689838661058983, |
|
"learning_rate": 3.9803476766265835e-05, |
|
"loss": 0.3648, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.218821729150727, |
|
"grad_norm": 0.24705256051578678, |
|
"learning_rate": 3.839017563445489e-05, |
|
"loss": 0.3625, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.234123947972456, |
|
"grad_norm": 0.2496443256139868, |
|
"learning_rate": 3.699643741906193e-05, |
|
"loss": 0.3545, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.2494261667941853, |
|
"grad_norm": 0.249005841241916, |
|
"learning_rate": 3.562270467262619e-05, |
|
"loss": 0.3601, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.264728385615914, |
|
"grad_norm": 0.24757649232051404, |
|
"learning_rate": 3.426941359536699e-05, |
|
"loss": 0.3576, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.2800306044376435, |
|
"grad_norm": 0.25451870114574876, |
|
"learning_rate": 3.293699389667734e-05, |
|
"loss": 0.3648, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.295332823259373, |
|
"grad_norm": 0.2453131567916715, |
|
"learning_rate": 3.1625868658678784e-05, |
|
"loss": 0.3625, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.3106350420811017, |
|
"grad_norm": 0.26123253732278273, |
|
"learning_rate": 3.0336454201880404e-05, |
|
"loss": 0.3557, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.325937260902831, |
|
"grad_norm": 0.2626580334343511, |
|
"learning_rate": 2.9069159952984938e-05, |
|
"loss": 0.3566, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.34123947972456, |
|
"grad_norm": 0.25003501828393526, |
|
"learning_rate": 2.7824388314883876e-05, |
|
"loss": 0.3591, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.3565416985462893, |
|
"grad_norm": 0.2507207739640118, |
|
"learning_rate": 2.6602534538882752e-05, |
|
"loss": 0.358, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.371843917368018, |
|
"grad_norm": 0.2512257287691549, |
|
"learning_rate": 2.5403986599197403e-05, |
|
"loss": 0.3501, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.3871461361897475, |
|
"grad_norm": 0.2499142391014787, |
|
"learning_rate": 2.4229125069760773e-05, |
|
"loss": 0.3517, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.402448355011477, |
|
"grad_norm": 0.24884412260444377, |
|
"learning_rate": 2.30783230033796e-05, |
|
"loss": 0.3568, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.4177505738332057, |
|
"grad_norm": 0.25335712662759613, |
|
"learning_rate": 2.1951945813279306e-05, |
|
"loss": 0.3547, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.433052792654935, |
|
"grad_norm": 0.24937746984746909, |
|
"learning_rate": 2.0850351157074598e-05, |
|
"loss": 0.353, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.4483550114766643, |
|
"grad_norm": 0.26237405083230225, |
|
"learning_rate": 1.9773888823202747e-05, |
|
"loss": 0.3579, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.4636572302983932, |
|
"grad_norm": 0.24937407019841143, |
|
"learning_rate": 1.8722900619855577e-05, |
|
"loss": 0.3562, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.4789594491201226, |
|
"grad_norm": 0.2497739493217595, |
|
"learning_rate": 1.7697720266445374e-05, |
|
"loss": 0.3512, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.4942616679418514, |
|
"grad_norm": 0.250399923768968, |
|
"learning_rate": 1.6698673287639242e-05, |
|
"loss": 0.3556, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.5095638867635808, |
|
"grad_norm": 0.2538234603963528, |
|
"learning_rate": 1.5726076909995525e-05, |
|
"loss": 0.355, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.5248661055853097, |
|
"grad_norm": 0.2504434686740776, |
|
"learning_rate": 1.4780239961235143e-05, |
|
"loss": 0.3581, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.540168324407039, |
|
"grad_norm": 0.2537006354023553, |
|
"learning_rate": 1.3861462772179735e-05, |
|
"loss": 0.3529, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.5554705432287683, |
|
"grad_norm": 0.2560925692389684, |
|
"learning_rate": 1.297003708138792e-05, |
|
"loss": 0.3584, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.570772762050497, |
|
"grad_norm": 0.25340005468102866, |
|
"learning_rate": 1.2106245942519745e-05, |
|
"loss": 0.3562, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.5860749808722265, |
|
"grad_norm": 0.25785387176401525, |
|
"learning_rate": 1.1270363634458903e-05, |
|
"loss": 0.3541, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.601377199693956, |
|
"grad_norm": 0.2615280811952495, |
|
"learning_rate": 1.0462655574221213e-05, |
|
"loss": 0.3553, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.6166794185156848, |
|
"grad_norm": 0.24663819526398556, |
|
"learning_rate": 9.683378232676965e-06, |
|
"loss": 0.3532, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.631981637337414, |
|
"grad_norm": 0.2614751928374674, |
|
"learning_rate": 8.932779053113893e-06, |
|
"loss": 0.3546, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.647283856159143, |
|
"grad_norm": 0.25884670404819016, |
|
"learning_rate": 8.211096372666783e-06, |
|
"loss": 0.3559, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.6625860749808723, |
|
"grad_norm": 0.26303763365287397, |
|
"learning_rate": 7.518559346638432e-06, |
|
"loss": 0.3556, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.677888293802601, |
|
"grad_norm": 0.2591833975916801, |
|
"learning_rate": 6.855387875736152e-06, |
|
"loss": 0.3577, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.6931905126243305, |
|
"grad_norm": 0.24688502674320847, |
|
"learning_rate": 6.221792536246973e-06, |
|
"loss": 0.3557, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.70849273144606, |
|
"grad_norm": 0.26100100849151864, |
|
"learning_rate": 5.617974513173341e-06, |
|
"loss": 0.3548, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.7237949502677887, |
|
"grad_norm": 0.2699346579349574, |
|
"learning_rate": 5.044125536351196e-06, |
|
"loss": 0.3511, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.739097169089518, |
|
"grad_norm": 0.2717440390716044, |
|
"learning_rate": 4.500427819570097e-06, |
|
"loss": 0.3494, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.7543993879112474, |
|
"grad_norm": 0.2579283254483208, |
|
"learning_rate": 3.987054002714952e-06, |
|
"loss": 0.3569, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.7697016067329763, |
|
"grad_norm": 0.2594723269046269, |
|
"learning_rate": 3.504167096947952e-06, |
|
"loss": 0.3516, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.785003825554705, |
|
"grad_norm": 0.256455133609715, |
|
"learning_rate": 3.051920432947664e-06, |
|
"loss": 0.3512, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.8003060443764345, |
|
"grad_norm": 0.2621246657840175, |
|
"learning_rate": 2.6304576122221035e-06, |
|
"loss": 0.3527, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.815608263198164, |
|
"grad_norm": 0.26278995363196544, |
|
"learning_rate": 2.2399124615110846e-06, |
|
"loss": 0.3518, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.8309104820198927, |
|
"grad_norm": 0.27091602282401345, |
|
"learning_rate": 1.880408990292315e-06, |
|
"loss": 0.3581, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.846212700841622, |
|
"grad_norm": 0.25572786473847814, |
|
"learning_rate": 1.5520613514047655e-06, |
|
"loss": 0.3562, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.8615149196633514, |
|
"grad_norm": 0.26244627787054337, |
|
"learning_rate": 1.2549738048017846e-06, |
|
"loss": 0.3552, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.8768171384850802, |
|
"grad_norm": 0.26131103676147655, |
|
"learning_rate": 9.892406844456026e-07, |
|
"loss": 0.352, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.8921193573068096, |
|
"grad_norm": 0.27449669022834305, |
|
"learning_rate": 7.549463683534374e-07, |
|
"loss": 0.3504, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.907421576128539, |
|
"grad_norm": 0.264605800831863, |
|
"learning_rate": 5.521652518051368e-07, |
|
"loss": 0.3515, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.922723794950268, |
|
"grad_norm": 0.2618398016329481, |
|
"learning_rate": 3.809617237203744e-07, |
|
"loss": 0.3561, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.9380260137719967, |
|
"grad_norm": 0.26831015620903936, |
|
"learning_rate": 2.4139014621340494e-07, |
|
"loss": 0.3527, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.953328232593726, |
|
"grad_norm": 0.26666302239559175, |
|
"learning_rate": 1.334948373314493e-07, |
|
"loss": 0.3562, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.9686304514154553, |
|
"grad_norm": 0.262359494195635, |
|
"learning_rate": 5.7310056982418094e-08, |
|
"loss": 0.355, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.9839326702371842, |
|
"grad_norm": 0.2576422839291312, |
|
"learning_rate": 1.2859996056402423e-08, |
|
"loss": 0.3531, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.9977046671767407, |
|
"step": 1959, |
|
"total_flos": 3660773289099264.0, |
|
"train_loss": 0.4337311900958168, |
|
"train_runtime": 5331.8775, |
|
"train_samples_per_second": 5.882, |
|
"train_steps_per_second": 0.367 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1959, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3660773289099264.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|