{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1173, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02557544757033248, "grad_norm": 1.4732151620168266, "learning_rate": 5e-06, "loss": 0.8857, "step": 10 }, { "epoch": 0.05115089514066496, "grad_norm": 2.487021273446104, "learning_rate": 5e-06, "loss": 0.7797, "step": 20 }, { "epoch": 0.07672634271099744, "grad_norm": 3.1294161768477546, "learning_rate": 5e-06, "loss": 0.7563, "step": 30 }, { "epoch": 0.10230179028132992, "grad_norm": 0.8287185562040077, "learning_rate": 5e-06, "loss": 0.7317, "step": 40 }, { "epoch": 0.1278772378516624, "grad_norm": 0.8396366204298203, "learning_rate": 5e-06, "loss": 0.7124, "step": 50 }, { "epoch": 0.1534526854219949, "grad_norm": 0.675481475401149, "learning_rate": 5e-06, "loss": 0.7089, "step": 60 }, { "epoch": 0.17902813299232737, "grad_norm": 0.8292154979182814, "learning_rate": 5e-06, "loss": 0.6992, "step": 70 }, { "epoch": 0.20460358056265984, "grad_norm": 0.786706122696529, "learning_rate": 5e-06, "loss": 0.6922, "step": 80 }, { "epoch": 0.23017902813299232, "grad_norm": 0.6411055415388162, "learning_rate": 5e-06, "loss": 0.6885, "step": 90 }, { "epoch": 0.2557544757033248, "grad_norm": 0.5992551668740695, "learning_rate": 5e-06, "loss": 0.6729, "step": 100 }, { "epoch": 0.2813299232736573, "grad_norm": 0.6680597868527147, "learning_rate": 5e-06, "loss": 0.6687, "step": 110 }, { "epoch": 0.3069053708439898, "grad_norm": 0.5180401044360867, "learning_rate": 5e-06, "loss": 0.6746, "step": 120 }, { "epoch": 0.33248081841432225, "grad_norm": 0.7933457420559259, "learning_rate": 5e-06, "loss": 0.6763, "step": 130 }, { "epoch": 0.35805626598465473, "grad_norm": 0.6077558627807389, "learning_rate": 5e-06, "loss": 0.6653, "step": 140 }, { "epoch": 0.3836317135549872, "grad_norm": 0.5214150563798917, "learning_rate": 5e-06, "loss": 0.6637, "step": 150 }, { "epoch": 0.4092071611253197, "grad_norm": 0.4782424652477103, "learning_rate": 5e-06, "loss": 0.6743, "step": 160 }, { "epoch": 0.43478260869565216, "grad_norm": 0.5531417892206768, "learning_rate": 5e-06, "loss": 0.663, "step": 170 }, { "epoch": 0.46035805626598464, "grad_norm": 0.6140295950380247, "learning_rate": 5e-06, "loss": 0.6627, "step": 180 }, { "epoch": 0.4859335038363171, "grad_norm": 0.452088230614636, "learning_rate": 5e-06, "loss": 0.6565, "step": 190 }, { "epoch": 0.5115089514066496, "grad_norm": 0.5771504479981889, "learning_rate": 5e-06, "loss": 0.6544, "step": 200 }, { "epoch": 0.5370843989769821, "grad_norm": 0.5981768742786622, "learning_rate": 5e-06, "loss": 0.6648, "step": 210 }, { "epoch": 0.5626598465473146, "grad_norm": 0.6172970384456833, "learning_rate": 5e-06, "loss": 0.6531, "step": 220 }, { "epoch": 0.5882352941176471, "grad_norm": 0.4782698820225656, "learning_rate": 5e-06, "loss": 0.6564, "step": 230 }, { "epoch": 0.6138107416879796, "grad_norm": 0.7221555632733867, "learning_rate": 5e-06, "loss": 0.6472, "step": 240 }, { "epoch": 0.639386189258312, "grad_norm": 0.6008946634556908, "learning_rate": 5e-06, "loss": 0.6586, "step": 250 }, { "epoch": 0.6649616368286445, "grad_norm": 0.4981912776738461, "learning_rate": 5e-06, "loss": 0.6433, "step": 260 }, { "epoch": 0.690537084398977, "grad_norm": 0.4692459483664614, "learning_rate": 5e-06, "loss": 0.6504, "step": 270 }, { "epoch": 0.7161125319693095, "grad_norm": 0.4193700978332349, "learning_rate": 5e-06, "loss": 0.6484, "step": 280 }, { "epoch": 0.7416879795396419, "grad_norm": 0.5445412936907831, "learning_rate": 5e-06, "loss": 0.6482, "step": 290 }, { "epoch": 0.7672634271099744, "grad_norm": 0.657805783671651, "learning_rate": 5e-06, "loss": 0.6488, "step": 300 }, { "epoch": 0.7928388746803069, "grad_norm": 0.5488970184879203, "learning_rate": 5e-06, "loss": 0.6474, "step": 310 }, { "epoch": 0.8184143222506394, "grad_norm": 0.5409806471796, "learning_rate": 5e-06, "loss": 0.6489, "step": 320 }, { "epoch": 0.8439897698209718, "grad_norm": 0.5596800096190868, "learning_rate": 5e-06, "loss": 0.6435, "step": 330 }, { "epoch": 0.8695652173913043, "grad_norm": 0.4643925302456495, "learning_rate": 5e-06, "loss": 0.6443, "step": 340 }, { "epoch": 0.8951406649616368, "grad_norm": 0.6375102071117671, "learning_rate": 5e-06, "loss": 0.6409, "step": 350 }, { "epoch": 0.9207161125319693, "grad_norm": 0.4775041999632585, "learning_rate": 5e-06, "loss": 0.6453, "step": 360 }, { "epoch": 0.9462915601023018, "grad_norm": 0.49604882534300143, "learning_rate": 5e-06, "loss": 0.644, "step": 370 }, { "epoch": 0.9718670076726342, "grad_norm": 0.45731037425025095, "learning_rate": 5e-06, "loss": 0.6414, "step": 380 }, { "epoch": 0.9974424552429667, "grad_norm": 0.4573950197808899, "learning_rate": 5e-06, "loss": 0.6452, "step": 390 }, { "epoch": 1.0, "eval_loss": 0.6434539556503296, "eval_runtime": 38.9026, "eval_samples_per_second": 270.779, "eval_steps_per_second": 1.08, "step": 391 }, { "epoch": 1.0230179028132993, "grad_norm": 0.7487458246165611, "learning_rate": 5e-06, "loss": 0.6099, "step": 400 }, { "epoch": 1.0485933503836318, "grad_norm": 0.4958215078357475, "learning_rate": 5e-06, "loss": 0.6049, "step": 410 }, { "epoch": 1.0741687979539642, "grad_norm": 0.5433206872781998, "learning_rate": 5e-06, "loss": 0.6013, "step": 420 }, { "epoch": 1.0997442455242967, "grad_norm": 0.5662951930987719, "learning_rate": 5e-06, "loss": 0.5995, "step": 430 }, { "epoch": 1.1253196930946292, "grad_norm": 0.4918784496618239, "learning_rate": 5e-06, "loss": 0.6027, "step": 440 }, { "epoch": 1.1508951406649617, "grad_norm": 0.5467386014101446, "learning_rate": 5e-06, "loss": 0.6103, "step": 450 }, { "epoch": 1.1764705882352942, "grad_norm": 0.7179950207720284, "learning_rate": 5e-06, "loss": 0.5999, "step": 460 }, { "epoch": 1.2020460358056266, "grad_norm": 0.5960679117502871, "learning_rate": 5e-06, "loss": 0.6058, "step": 470 }, { "epoch": 1.227621483375959, "grad_norm": 0.5020974407898211, "learning_rate": 5e-06, "loss": 0.6013, "step": 480 }, { "epoch": 1.2531969309462916, "grad_norm": 0.5031636785755809, "learning_rate": 5e-06, "loss": 0.5985, "step": 490 }, { "epoch": 1.278772378516624, "grad_norm": 0.5506207838193202, "learning_rate": 5e-06, "loss": 0.5971, "step": 500 }, { "epoch": 1.3043478260869565, "grad_norm": 0.5156198566972936, "learning_rate": 5e-06, "loss": 0.6043, "step": 510 }, { "epoch": 1.329923273657289, "grad_norm": 0.5484894340051472, "learning_rate": 5e-06, "loss": 0.6089, "step": 520 }, { "epoch": 1.3554987212276215, "grad_norm": 0.47915499629457314, "learning_rate": 5e-06, "loss": 0.6016, "step": 530 }, { "epoch": 1.381074168797954, "grad_norm": 0.8192433363651986, "learning_rate": 5e-06, "loss": 0.5987, "step": 540 }, { "epoch": 1.4066496163682864, "grad_norm": 0.49011715250049237, "learning_rate": 5e-06, "loss": 0.6009, "step": 550 }, { "epoch": 1.432225063938619, "grad_norm": 0.7466359785733482, "learning_rate": 5e-06, "loss": 0.5974, "step": 560 }, { "epoch": 1.4578005115089514, "grad_norm": 0.5087738931925487, "learning_rate": 5e-06, "loss": 0.6009, "step": 570 }, { "epoch": 1.4833759590792839, "grad_norm": 0.6609977172284965, "learning_rate": 5e-06, "loss": 0.5994, "step": 580 }, { "epoch": 1.5089514066496164, "grad_norm": 0.503456913727176, "learning_rate": 5e-06, "loss": 0.5948, "step": 590 }, { "epoch": 1.5345268542199488, "grad_norm": 0.5213824233459637, "learning_rate": 5e-06, "loss": 0.6072, "step": 600 }, { "epoch": 1.5601023017902813, "grad_norm": 0.45319447189866463, "learning_rate": 5e-06, "loss": 0.5915, "step": 610 }, { "epoch": 1.5856777493606138, "grad_norm": 0.457417614147174, "learning_rate": 5e-06, "loss": 0.5879, "step": 620 }, { "epoch": 1.6112531969309463, "grad_norm": 0.5224900055650589, "learning_rate": 5e-06, "loss": 0.6002, "step": 630 }, { "epoch": 1.6368286445012787, "grad_norm": 0.4973950787447109, "learning_rate": 5e-06, "loss": 0.5946, "step": 640 }, { "epoch": 1.6624040920716112, "grad_norm": 0.4790552677041308, "learning_rate": 5e-06, "loss": 0.589, "step": 650 }, { "epoch": 1.6879795396419437, "grad_norm": 0.47449533507634256, "learning_rate": 5e-06, "loss": 0.5994, "step": 660 }, { "epoch": 1.7135549872122762, "grad_norm": 0.5393859616293808, "learning_rate": 5e-06, "loss": 0.6011, "step": 670 }, { "epoch": 1.7391304347826086, "grad_norm": 0.42668160107621367, "learning_rate": 5e-06, "loss": 0.5943, "step": 680 }, { "epoch": 1.7647058823529411, "grad_norm": 0.5017031802880021, "learning_rate": 5e-06, "loss": 0.597, "step": 690 }, { "epoch": 1.7902813299232738, "grad_norm": 0.500629762210993, "learning_rate": 5e-06, "loss": 0.6012, "step": 700 }, { "epoch": 1.815856777493606, "grad_norm": 0.45045745123083464, "learning_rate": 5e-06, "loss": 0.5996, "step": 710 }, { "epoch": 1.8414322250639388, "grad_norm": 0.6404335062666666, "learning_rate": 5e-06, "loss": 0.6001, "step": 720 }, { "epoch": 1.867007672634271, "grad_norm": 0.48025967143483794, "learning_rate": 5e-06, "loss": 0.5995, "step": 730 }, { "epoch": 1.8925831202046037, "grad_norm": 0.5055252053795981, "learning_rate": 5e-06, "loss": 0.5896, "step": 740 }, { "epoch": 1.918158567774936, "grad_norm": 0.6653435248566397, "learning_rate": 5e-06, "loss": 0.603, "step": 750 }, { "epoch": 1.9437340153452687, "grad_norm": 0.6727260672608437, "learning_rate": 5e-06, "loss": 0.6036, "step": 760 }, { "epoch": 1.969309462915601, "grad_norm": 0.5907496423463581, "learning_rate": 5e-06, "loss": 0.6086, "step": 770 }, { "epoch": 1.9948849104859336, "grad_norm": 0.5760288384496973, "learning_rate": 5e-06, "loss": 0.5983, "step": 780 }, { "epoch": 2.0, "eval_loss": 0.6335848569869995, "eval_runtime": 38.2748, "eval_samples_per_second": 275.22, "eval_steps_per_second": 1.097, "step": 782 }, { "epoch": 2.020460358056266, "grad_norm": 0.5888157228328489, "learning_rate": 5e-06, "loss": 0.5672, "step": 790 }, { "epoch": 2.0460358056265986, "grad_norm": 0.5083129678011372, "learning_rate": 5e-06, "loss": 0.5592, "step": 800 }, { "epoch": 2.071611253196931, "grad_norm": 0.48204204488209723, "learning_rate": 5e-06, "loss": 0.5431, "step": 810 }, { "epoch": 2.0971867007672635, "grad_norm": 0.43509395294018177, "learning_rate": 5e-06, "loss": 0.5558, "step": 820 }, { "epoch": 2.122762148337596, "grad_norm": 0.4873856782902337, "learning_rate": 5e-06, "loss": 0.549, "step": 830 }, { "epoch": 2.1483375959079285, "grad_norm": 0.47421724914953955, "learning_rate": 5e-06, "loss": 0.5595, "step": 840 }, { "epoch": 2.1739130434782608, "grad_norm": 0.5324778084056383, "learning_rate": 5e-06, "loss": 0.5584, "step": 850 }, { "epoch": 2.1994884910485935, "grad_norm": 0.6187236206022877, "learning_rate": 5e-06, "loss": 0.5556, "step": 860 }, { "epoch": 2.2250639386189257, "grad_norm": 0.5533284770550284, "learning_rate": 5e-06, "loss": 0.5595, "step": 870 }, { "epoch": 2.2506393861892584, "grad_norm": 0.5115512932458932, "learning_rate": 5e-06, "loss": 0.5541, "step": 880 }, { "epoch": 2.2762148337595907, "grad_norm": 0.8951470613775953, "learning_rate": 5e-06, "loss": 0.5597, "step": 890 }, { "epoch": 2.3017902813299234, "grad_norm": 0.48320795887005585, "learning_rate": 5e-06, "loss": 0.5585, "step": 900 }, { "epoch": 2.3273657289002556, "grad_norm": 0.5389304781613019, "learning_rate": 5e-06, "loss": 0.553, "step": 910 }, { "epoch": 2.3529411764705883, "grad_norm": 0.5504265312204787, "learning_rate": 5e-06, "loss": 0.5555, "step": 920 }, { "epoch": 2.3785166240409206, "grad_norm": 0.6781391025002083, "learning_rate": 5e-06, "loss": 0.5539, "step": 930 }, { "epoch": 2.4040920716112533, "grad_norm": 0.5284123264856311, "learning_rate": 5e-06, "loss": 0.5596, "step": 940 }, { "epoch": 2.4296675191815855, "grad_norm": 0.573703259910989, "learning_rate": 5e-06, "loss": 0.5575, "step": 950 }, { "epoch": 2.455242966751918, "grad_norm": 0.519429928779352, "learning_rate": 5e-06, "loss": 0.562, "step": 960 }, { "epoch": 2.4808184143222505, "grad_norm": 0.5716653700360348, "learning_rate": 5e-06, "loss": 0.5555, "step": 970 }, { "epoch": 2.506393861892583, "grad_norm": 0.45693028304624417, "learning_rate": 5e-06, "loss": 0.5586, "step": 980 }, { "epoch": 2.531969309462916, "grad_norm": 0.5488274428294824, "learning_rate": 5e-06, "loss": 0.5573, "step": 990 }, { "epoch": 2.557544757033248, "grad_norm": 0.4681111347906789, "learning_rate": 5e-06, "loss": 0.559, "step": 1000 }, { "epoch": 2.5831202046035804, "grad_norm": 0.4588067639273481, "learning_rate": 5e-06, "loss": 0.5663, "step": 1010 }, { "epoch": 2.608695652173913, "grad_norm": 0.6878919689252865, "learning_rate": 5e-06, "loss": 0.5639, "step": 1020 }, { "epoch": 2.634271099744246, "grad_norm": 0.4447545765506167, "learning_rate": 5e-06, "loss": 0.5605, "step": 1030 }, { "epoch": 2.659846547314578, "grad_norm": 0.47570519434711034, "learning_rate": 5e-06, "loss": 0.5629, "step": 1040 }, { "epoch": 2.6854219948849103, "grad_norm": 0.4839862926370354, "learning_rate": 5e-06, "loss": 0.5559, "step": 1050 }, { "epoch": 2.710997442455243, "grad_norm": 0.6321260475168335, "learning_rate": 5e-06, "loss": 0.5602, "step": 1060 }, { "epoch": 2.7365728900255757, "grad_norm": 0.660422843874434, "learning_rate": 5e-06, "loss": 0.5647, "step": 1070 }, { "epoch": 2.762148337595908, "grad_norm": 0.6591827642794408, "learning_rate": 5e-06, "loss": 0.5561, "step": 1080 }, { "epoch": 2.78772378516624, "grad_norm": 0.5629633198627261, "learning_rate": 5e-06, "loss": 0.5536, "step": 1090 }, { "epoch": 2.813299232736573, "grad_norm": 0.4679817123851223, "learning_rate": 5e-06, "loss": 0.5579, "step": 1100 }, { "epoch": 2.8388746803069056, "grad_norm": 0.5611457522346606, "learning_rate": 5e-06, "loss": 0.5629, "step": 1110 }, { "epoch": 2.864450127877238, "grad_norm": 0.4877999696633639, "learning_rate": 5e-06, "loss": 0.5662, "step": 1120 }, { "epoch": 2.89002557544757, "grad_norm": 0.48460006644169323, "learning_rate": 5e-06, "loss": 0.5622, "step": 1130 }, { "epoch": 2.915601023017903, "grad_norm": 0.47342607041413165, "learning_rate": 5e-06, "loss": 0.5613, "step": 1140 }, { "epoch": 2.9411764705882355, "grad_norm": 0.4640271559873245, "learning_rate": 5e-06, "loss": 0.5634, "step": 1150 }, { "epoch": 2.9667519181585678, "grad_norm": 0.47736761450925297, "learning_rate": 5e-06, "loss": 0.5549, "step": 1160 }, { "epoch": 2.9923273657289, "grad_norm": 0.5483840655299442, "learning_rate": 5e-06, "loss": 0.5549, "step": 1170 }, { "epoch": 3.0, "eval_loss": 0.6347914934158325, "eval_runtime": 37.6736, "eval_samples_per_second": 279.612, "eval_steps_per_second": 1.115, "step": 1173 }, { "epoch": 3.0, "step": 1173, "total_flos": 1964818688901120.0, "train_loss": 0.6108124574228727, "train_runtime": 7401.7193, "train_samples_per_second": 81.12, "train_steps_per_second": 0.158 } ], "logging_steps": 10, "max_steps": 1173, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1964818688901120.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }