{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025, "grad_norm": 14.020469665527344, "learning_rate": 5e-06, "loss": 0.8991, "step": 10 }, { "epoch": 0.05, "grad_norm": 2.1231062412261963, "learning_rate": 5e-06, "loss": 0.7786, "step": 20 }, { "epoch": 0.075, "grad_norm": 0.8827582597732544, "learning_rate": 5e-06, "loss": 0.7194, "step": 30 }, { "epoch": 0.1, "grad_norm": 0.8402972221374512, "learning_rate": 5e-06, "loss": 0.6925, "step": 40 }, { "epoch": 0.125, "grad_norm": 0.8711609840393066, "learning_rate": 5e-06, "loss": 0.6664, "step": 50 }, { "epoch": 0.15, "grad_norm": 1.020943284034729, "learning_rate": 5e-06, "loss": 0.6468, "step": 60 }, { "epoch": 0.175, "grad_norm": 0.6615628600120544, "learning_rate": 5e-06, "loss": 0.6438, "step": 70 }, { "epoch": 0.2, "grad_norm": 0.6699038743972778, "learning_rate": 5e-06, "loss": 0.6357, "step": 80 }, { "epoch": 0.225, "grad_norm": 0.6830587387084961, "learning_rate": 5e-06, "loss": 0.6206, "step": 90 }, { "epoch": 0.25, "grad_norm": 0.5887433886528015, "learning_rate": 5e-06, "loss": 0.613, "step": 100 }, { "epoch": 0.275, "grad_norm": 0.491720587015152, "learning_rate": 5e-06, "loss": 0.6036, "step": 110 }, { "epoch": 0.3, "grad_norm": 0.6611770391464233, "learning_rate": 5e-06, "loss": 0.6014, "step": 120 }, { "epoch": 0.325, "grad_norm": 0.6511964797973633, "learning_rate": 5e-06, "loss": 0.6006, "step": 130 }, { "epoch": 0.35, "grad_norm": 0.5848156213760376, "learning_rate": 5e-06, "loss": 0.6091, "step": 140 }, { "epoch": 0.375, "grad_norm": 0.6775133609771729, "learning_rate": 5e-06, "loss": 0.6004, "step": 150 }, { "epoch": 0.4, "grad_norm": 0.5474533438682556, "learning_rate": 5e-06, "loss": 0.6068, "step": 160 }, { "epoch": 0.425, "grad_norm": 0.4722808301448822, "learning_rate": 5e-06, "loss": 0.5916, "step": 170 }, { "epoch": 0.45, "grad_norm": 0.5218152403831482, "learning_rate": 5e-06, "loss": 0.5972, "step": 180 }, { "epoch": 0.475, "grad_norm": 0.703032374382019, "learning_rate": 5e-06, "loss": 0.5896, "step": 190 }, { "epoch": 0.5, "grad_norm": 0.6441683769226074, "learning_rate": 5e-06, "loss": 0.592, "step": 200 }, { "epoch": 0.525, "grad_norm": 0.5189586281776428, "learning_rate": 5e-06, "loss": 0.5887, "step": 210 }, { "epoch": 0.55, "grad_norm": 0.5127268433570862, "learning_rate": 5e-06, "loss": 0.5825, "step": 220 }, { "epoch": 0.575, "grad_norm": 0.5413115620613098, "learning_rate": 5e-06, "loss": 0.5804, "step": 230 }, { "epoch": 0.6, "grad_norm": 0.48631197214126587, "learning_rate": 5e-06, "loss": 0.5836, "step": 240 }, { "epoch": 0.625, "grad_norm": 0.554779052734375, "learning_rate": 5e-06, "loss": 0.5831, "step": 250 }, { "epoch": 0.65, "grad_norm": 0.5633807182312012, "learning_rate": 5e-06, "loss": 0.5815, "step": 260 }, { "epoch": 0.675, "grad_norm": 0.5503019690513611, "learning_rate": 5e-06, "loss": 0.5771, "step": 270 }, { "epoch": 0.7, "grad_norm": 0.4752262830734253, "learning_rate": 5e-06, "loss": 0.5778, "step": 280 }, { "epoch": 0.725, "grad_norm": 0.6272807121276855, "learning_rate": 5e-06, "loss": 0.5763, "step": 290 }, { "epoch": 0.75, "grad_norm": 0.6500713229179382, "learning_rate": 5e-06, "loss": 0.573, "step": 300 }, { "epoch": 0.775, "grad_norm": 0.5538522005081177, "learning_rate": 5e-06, "loss": 0.575, "step": 310 }, { "epoch": 0.8, "grad_norm": 0.674211323261261, "learning_rate": 5e-06, "loss": 0.5775, "step": 320 }, { "epoch": 0.825, "grad_norm": 0.5151064991950989, "learning_rate": 5e-06, "loss": 0.5695, "step": 330 }, { "epoch": 0.85, "grad_norm": 0.45805805921554565, "learning_rate": 5e-06, "loss": 0.5658, "step": 340 }, { "epoch": 0.875, "grad_norm": 0.907016932964325, "learning_rate": 5e-06, "loss": 0.5682, "step": 350 }, { "epoch": 0.9, "grad_norm": 0.4508662521839142, "learning_rate": 5e-06, "loss": 0.5691, "step": 360 }, { "epoch": 0.925, "grad_norm": 0.5657577514648438, "learning_rate": 5e-06, "loss": 0.5661, "step": 370 }, { "epoch": 0.95, "grad_norm": 0.4820933938026428, "learning_rate": 5e-06, "loss": 0.5677, "step": 380 }, { "epoch": 0.975, "grad_norm": 0.4836273491382599, "learning_rate": 5e-06, "loss": 0.5654, "step": 390 }, { "epoch": 1.0, "grad_norm": 0.4653589427471161, "learning_rate": 5e-06, "loss": 0.5566, "step": 400 }, { "epoch": 1.0, "eval_loss": 0.5628939270973206, "eval_runtime": 36.7288, "eval_samples_per_second": 292.985, "eval_steps_per_second": 1.171, "step": 400 }, { "epoch": 1.025, "grad_norm": 0.5971388816833496, "learning_rate": 5e-06, "loss": 0.526, "step": 410 }, { "epoch": 1.05, "grad_norm": 0.6474968791007996, "learning_rate": 5e-06, "loss": 0.5298, "step": 420 }, { "epoch": 1.075, "grad_norm": 0.7014959454536438, "learning_rate": 5e-06, "loss": 0.5299, "step": 430 }, { "epoch": 1.1, "grad_norm": 0.7876601219177246, "learning_rate": 5e-06, "loss": 0.5273, "step": 440 }, { "epoch": 1.125, "grad_norm": 0.5993332862854004, "learning_rate": 5e-06, "loss": 0.5274, "step": 450 }, { "epoch": 1.15, "grad_norm": 0.476075142621994, "learning_rate": 5e-06, "loss": 0.5241, "step": 460 }, { "epoch": 1.175, "grad_norm": 0.509283721446991, "learning_rate": 5e-06, "loss": 0.5204, "step": 470 }, { "epoch": 1.2, "grad_norm": 0.5607204437255859, "learning_rate": 5e-06, "loss": 0.524, "step": 480 }, { "epoch": 1.225, "grad_norm": 0.6008129715919495, "learning_rate": 5e-06, "loss": 0.5247, "step": 490 }, { "epoch": 1.25, "grad_norm": 0.5496598482131958, "learning_rate": 5e-06, "loss": 0.5224, "step": 500 }, { "epoch": 1.275, "grad_norm": 0.46281903982162476, "learning_rate": 5e-06, "loss": 0.5223, "step": 510 }, { "epoch": 1.3, "grad_norm": 0.5407223105430603, "learning_rate": 5e-06, "loss": 0.5233, "step": 520 }, { "epoch": 1.325, "grad_norm": 0.6812359690666199, "learning_rate": 5e-06, "loss": 0.5253, "step": 530 }, { "epoch": 1.35, "grad_norm": 0.5700986385345459, "learning_rate": 5e-06, "loss": 0.5198, "step": 540 }, { "epoch": 1.375, "grad_norm": 0.7843871712684631, "learning_rate": 5e-06, "loss": 0.5199, "step": 550 }, { "epoch": 1.4, "grad_norm": 0.5454580187797546, "learning_rate": 5e-06, "loss": 0.5206, "step": 560 }, { "epoch": 1.425, "grad_norm": 0.5220310091972351, "learning_rate": 5e-06, "loss": 0.5213, "step": 570 }, { "epoch": 1.45, "grad_norm": 0.5269823670387268, "learning_rate": 5e-06, "loss": 0.5222, "step": 580 }, { "epoch": 1.475, "grad_norm": 0.5376739501953125, "learning_rate": 5e-06, "loss": 0.5194, "step": 590 }, { "epoch": 1.5, "grad_norm": 0.58550626039505, "learning_rate": 5e-06, "loss": 0.5238, "step": 600 }, { "epoch": 1.525, "grad_norm": 0.5343301296234131, "learning_rate": 5e-06, "loss": 0.515, "step": 610 }, { "epoch": 1.55, "grad_norm": 0.6829107999801636, "learning_rate": 5e-06, "loss": 0.519, "step": 620 }, { "epoch": 1.575, "grad_norm": 0.7822884917259216, "learning_rate": 5e-06, "loss": 0.5103, "step": 630 }, { "epoch": 1.6, "grad_norm": 0.5208476781845093, "learning_rate": 5e-06, "loss": 0.515, "step": 640 }, { "epoch": 1.625, "grad_norm": 0.6014963984489441, "learning_rate": 5e-06, "loss": 0.5162, "step": 650 }, { "epoch": 1.65, "grad_norm": 0.5044267177581787, "learning_rate": 5e-06, "loss": 0.5073, "step": 660 }, { "epoch": 1.675, "grad_norm": 0.6184217929840088, "learning_rate": 5e-06, "loss": 0.5114, "step": 670 }, { "epoch": 1.7, "grad_norm": 0.5359528064727783, "learning_rate": 5e-06, "loss": 0.5074, "step": 680 }, { "epoch": 1.725, "grad_norm": 0.5592532753944397, "learning_rate": 5e-06, "loss": 0.5129, "step": 690 }, { "epoch": 1.75, "grad_norm": 0.6100260615348816, "learning_rate": 5e-06, "loss": 0.5147, "step": 700 }, { "epoch": 1.775, "grad_norm": 0.542510986328125, "learning_rate": 5e-06, "loss": 0.5091, "step": 710 }, { "epoch": 1.8, "grad_norm": 0.5745417475700378, "learning_rate": 5e-06, "loss": 0.5064, "step": 720 }, { "epoch": 1.825, "grad_norm": 0.48998454213142395, "learning_rate": 5e-06, "loss": 0.5114, "step": 730 }, { "epoch": 1.85, "grad_norm": 0.48566123843193054, "learning_rate": 5e-06, "loss": 0.5136, "step": 740 }, { "epoch": 1.875, "grad_norm": 0.5040503740310669, "learning_rate": 5e-06, "loss": 0.5139, "step": 750 }, { "epoch": 1.9, "grad_norm": 0.5267658233642578, "learning_rate": 5e-06, "loss": 0.5099, "step": 760 }, { "epoch": 1.925, "grad_norm": 0.6203451156616211, "learning_rate": 5e-06, "loss": 0.5062, "step": 770 }, { "epoch": 1.95, "grad_norm": 0.5038536787033081, "learning_rate": 5e-06, "loss": 0.5062, "step": 780 }, { "epoch": 1.975, "grad_norm": 0.571147620677948, "learning_rate": 5e-06, "loss": 0.5072, "step": 790 }, { "epoch": 2.0, "grad_norm": 0.5538639426231384, "learning_rate": 5e-06, "loss": 0.5106, "step": 800 }, { "epoch": 2.0, "eval_loss": 0.5380541682243347, "eval_runtime": 36.1734, "eval_samples_per_second": 297.484, "eval_steps_per_second": 1.189, "step": 800 }, { "epoch": 2.025, "grad_norm": 0.5824851989746094, "learning_rate": 5e-06, "loss": 0.4697, "step": 810 }, { "epoch": 2.05, "grad_norm": 0.5304216742515564, "learning_rate": 5e-06, "loss": 0.4702, "step": 820 }, { "epoch": 2.075, "grad_norm": 0.6173574924468994, "learning_rate": 5e-06, "loss": 0.4704, "step": 830 }, { "epoch": 2.1, "grad_norm": 0.5461722016334534, "learning_rate": 5e-06, "loss": 0.4737, "step": 840 }, { "epoch": 2.125, "grad_norm": 0.5816064476966858, "learning_rate": 5e-06, "loss": 0.472, "step": 850 }, { "epoch": 2.15, "grad_norm": 0.5272941589355469, "learning_rate": 5e-06, "loss": 0.47, "step": 860 }, { "epoch": 2.175, "grad_norm": 0.6730207800865173, "learning_rate": 5e-06, "loss": 0.4747, "step": 870 }, { "epoch": 2.2, "grad_norm": 0.5066570043563843, "learning_rate": 5e-06, "loss": 0.4721, "step": 880 }, { "epoch": 2.225, "grad_norm": 0.5733837485313416, "learning_rate": 5e-06, "loss": 0.47, "step": 890 }, { "epoch": 2.25, "grad_norm": 0.5919420719146729, "learning_rate": 5e-06, "loss": 0.4697, "step": 900 }, { "epoch": 2.275, "grad_norm": 0.600871741771698, "learning_rate": 5e-06, "loss": 0.473, "step": 910 }, { "epoch": 2.3, "grad_norm": 0.5913262367248535, "learning_rate": 5e-06, "loss": 0.4734, "step": 920 }, { "epoch": 2.325, "grad_norm": 0.6537840962409973, "learning_rate": 5e-06, "loss": 0.4702, "step": 930 }, { "epoch": 2.35, "grad_norm": 0.4861612915992737, "learning_rate": 5e-06, "loss": 0.4713, "step": 940 }, { "epoch": 2.375, "grad_norm": 0.5109017491340637, "learning_rate": 5e-06, "loss": 0.4714, "step": 950 }, { "epoch": 2.4, "grad_norm": 0.560487687587738, "learning_rate": 5e-06, "loss": 0.4751, "step": 960 }, { "epoch": 2.425, "grad_norm": 0.618090033531189, "learning_rate": 5e-06, "loss": 0.4703, "step": 970 }, { "epoch": 2.45, "grad_norm": 0.5797336101531982, "learning_rate": 5e-06, "loss": 0.4694, "step": 980 }, { "epoch": 2.475, "grad_norm": 0.7067740559577942, "learning_rate": 5e-06, "loss": 0.4695, "step": 990 }, { "epoch": 2.5, "grad_norm": 0.7067424654960632, "learning_rate": 5e-06, "loss": 0.4772, "step": 1000 }, { "epoch": 2.525, "grad_norm": 0.5409600138664246, "learning_rate": 5e-06, "loss": 0.4696, "step": 1010 }, { "epoch": 2.55, "grad_norm": 0.6049436926841736, "learning_rate": 5e-06, "loss": 0.4763, "step": 1020 }, { "epoch": 2.575, "grad_norm": 0.591397225856781, "learning_rate": 5e-06, "loss": 0.4757, "step": 1030 }, { "epoch": 2.6, "grad_norm": 0.5943789482116699, "learning_rate": 5e-06, "loss": 0.475, "step": 1040 }, { "epoch": 2.625, "grad_norm": 0.5674725770950317, "learning_rate": 5e-06, "loss": 0.4732, "step": 1050 }, { "epoch": 2.65, "grad_norm": 0.4904470443725586, "learning_rate": 5e-06, "loss": 0.4734, "step": 1060 }, { "epoch": 2.675, "grad_norm": 0.5225439667701721, "learning_rate": 5e-06, "loss": 0.4795, "step": 1070 }, { "epoch": 2.7, "grad_norm": 0.6196999549865723, "learning_rate": 5e-06, "loss": 0.4751, "step": 1080 }, { "epoch": 2.725, "grad_norm": 0.7215800881385803, "learning_rate": 5e-06, "loss": 0.4696, "step": 1090 }, { "epoch": 2.75, "grad_norm": 0.6502852439880371, "learning_rate": 5e-06, "loss": 0.474, "step": 1100 }, { "epoch": 2.775, "grad_norm": 0.512017011642456, "learning_rate": 5e-06, "loss": 0.4763, "step": 1110 }, { "epoch": 2.8, "grad_norm": 0.5410541296005249, "learning_rate": 5e-06, "loss": 0.4705, "step": 1120 }, { "epoch": 2.825, "grad_norm": 0.5041213631629944, "learning_rate": 5e-06, "loss": 0.469, "step": 1130 }, { "epoch": 2.85, "grad_norm": 0.5700607895851135, "learning_rate": 5e-06, "loss": 0.4749, "step": 1140 }, { "epoch": 2.875, "grad_norm": 0.4994681775569916, "learning_rate": 5e-06, "loss": 0.4711, "step": 1150 }, { "epoch": 2.9, "grad_norm": 0.5657273530960083, "learning_rate": 5e-06, "loss": 0.4714, "step": 1160 }, { "epoch": 2.925, "grad_norm": 0.48754239082336426, "learning_rate": 5e-06, "loss": 0.4718, "step": 1170 }, { "epoch": 2.95, "grad_norm": 0.5921581983566284, "learning_rate": 5e-06, "loss": 0.4729, "step": 1180 }, { "epoch": 2.975, "grad_norm": 0.5031780004501343, "learning_rate": 5e-06, "loss": 0.4747, "step": 1190 }, { "epoch": 3.0, "grad_norm": 0.6396584510803223, "learning_rate": 5e-06, "loss": 0.4685, "step": 1200 }, { "epoch": 3.0, "eval_loss": 0.532584547996521, "eval_runtime": 36.8565, "eval_samples_per_second": 291.97, "eval_steps_per_second": 1.167, "step": 1200 }, { "epoch": 3.0, "step": 1200, "total_flos": 5.666028609022722e+19, "train_loss": 0.5332197288672129, "train_runtime": 8530.7812, "train_samples_per_second": 71.901, "train_steps_per_second": 0.141 } ], "logging_steps": 10, "max_steps": 1200, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.666028609022722e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }