{ "best_metric": 0.34750062227249146, "best_model_checkpoint": "../../saves/LLaMA3-70B-qlora-bnb/lora/sft/A61K/checkpoint-100", "epoch": 2.9925925925925925, "eval_steps": 100, "global_step": 606, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04938271604938271, "grad_norm": 15.011398315429688, "learning_rate": 1.1999999999999999e-05, "loss": 12.2879, "step": 10 }, { "epoch": 0.09876543209876543, "grad_norm": 17.572187423706055, "learning_rate": 3.5999999999999994e-05, "loss": 11.973, "step": 20 }, { "epoch": 0.14814814814814814, "grad_norm": 45.80781936645508, "learning_rate": 6.599999999999999e-05, "loss": 9.264, "step": 30 }, { "epoch": 0.19753086419753085, "grad_norm": 14.461894989013672, "learning_rate": 9.599999999999999e-05, "loss": 3.6258, "step": 40 }, { "epoch": 0.24691358024691357, "grad_norm": 10.816905975341797, "learning_rate": 0.00012599999999999997, "loss": 0.5542, "step": 50 }, { "epoch": 0.2962962962962963, "grad_norm": 4.389413356781006, "learning_rate": 0.000156, "loss": 0.4193, "step": 60 }, { "epoch": 0.345679012345679, "grad_norm": 6.623525619506836, "learning_rate": 0.000186, "loss": 0.5134, "step": 70 }, { "epoch": 0.3950617283950617, "grad_norm": 2.265923023223877, "learning_rate": 0.00021599999999999996, "loss": 0.4182, "step": 80 }, { "epoch": 0.4444444444444444, "grad_norm": 2.2424216270446777, "learning_rate": 0.00024599999999999996, "loss": 0.3526, "step": 90 }, { "epoch": 0.49382716049382713, "grad_norm": 4.1003804206848145, "learning_rate": 0.000276, "loss": 0.3573, "step": 100 }, { "epoch": 0.49382716049382713, "eval_loss": 0.34750062227249146, "eval_runtime": 641.1941, "eval_samples_per_second": 0.281, "eval_steps_per_second": 0.281, "step": 100 }, { "epoch": 0.5432098765432098, "grad_norm": 4.202010154724121, "learning_rate": 0.00029998843583216637, "loss": 0.3412, "step": 110 }, { "epoch": 0.5925925925925926, "grad_norm": 7.416792869567871, "learning_rate": 0.000299583877149169, "loss": 0.4443, "step": 120 }, { "epoch": 0.6419753086419753, "grad_norm": 4.8874101638793945, "learning_rate": 0.0002986028919054496, "loss": 0.3789, "step": 130 }, { "epoch": 0.691358024691358, "grad_norm": 17.96492576599121, "learning_rate": 0.0002970492603610264, "loss": 0.4358, "step": 140 }, { "epoch": 0.7407407407407407, "grad_norm": 15.250304222106934, "learning_rate": 0.0002949289694879236, "loss": 0.4073, "step": 150 }, { "epoch": 0.7901234567901234, "grad_norm": 2.9382853507995605, "learning_rate": 0.00029225018989917134, "loss": 0.3782, "step": 160 }, { "epoch": 0.8395061728395061, "grad_norm": 9.713154792785645, "learning_rate": 0.00028902324436306994, "loss": 0.3941, "step": 170 }, { "epoch": 0.8888888888888888, "grad_norm": 2.8005290031433105, "learning_rate": 0.00028526056802405104, "loss": 0.345, "step": 180 }, { "epoch": 0.9382716049382716, "grad_norm": 179.18141174316406, "learning_rate": 0.0002809766604834258, "loss": 0.5958, "step": 190 }, { "epoch": 0.9876543209876543, "grad_norm": 3.8531386852264404, "learning_rate": 0.0002761880299246772, "loss": 0.3841, "step": 200 }, { "epoch": 0.9876543209876543, "eval_loss": 0.3884469270706177, "eval_runtime": 641.004, "eval_samples_per_second": 0.281, "eval_steps_per_second": 0.281, "step": 200 }, { "epoch": 1.037037037037037, "grad_norm": 5.21042537689209, "learning_rate": 0.0002709131294986136, "loss": 0.38, "step": 210 }, { "epoch": 1.0864197530864197, "grad_norm": 2.059401035308838, "learning_rate": 0.0002651722862135245, "loss": 0.3495, "step": 220 }, { "epoch": 1.1358024691358024, "grad_norm": 9.867389678955078, "learning_rate": 0.00025898762260436153, "loss": 0.4585, "step": 230 }, { "epoch": 1.1851851851851851, "grad_norm": 82.30119323730469, "learning_rate": 0.0002523829714827981, "loss": 0.4752, "step": 240 }, { "epoch": 1.2345679012345678, "grad_norm": 3.674161672592163, "learning_rate": 0.000245383784096678, "loss": 0.4348, "step": 250 }, { "epoch": 1.2839506172839505, "grad_norm": 18.10048484802246, "learning_rate": 0.00023801703205276613, "loss": 0.4606, "step": 260 }, { "epoch": 1.3333333333333333, "grad_norm": 4.7857232093811035, "learning_rate": 0.00023031110338074388, "loss": 0.4063, "step": 270 }, { "epoch": 1.382716049382716, "grad_norm": 4.155650615692139, "learning_rate": 0.00022229569313897066, "loss": 0.4185, "step": 280 }, { "epoch": 1.4320987654320987, "grad_norm": 19.67816734313965, "learning_rate": 0.00021400168898356626, "loss": 0.4242, "step": 290 }, { "epoch": 1.4814814814814814, "grad_norm": 6.764120101928711, "learning_rate": 0.00020546105214177678, "loss": 0.3648, "step": 300 }, { "epoch": 1.4814814814814814, "eval_loss": 0.36776283383369446, "eval_runtime": 640.9052, "eval_samples_per_second": 0.281, "eval_steps_per_second": 0.281, "step": 300 }, { "epoch": 1.5308641975308643, "grad_norm": 1.9953991174697876, "learning_rate": 0.0001967066942482978, "loss": 0.3639, "step": 310 }, { "epoch": 1.5802469135802468, "grad_norm": 4.738420486450195, "learning_rate": 0.00018777235051917025, "loss": 0.3831, "step": 320 }, { "epoch": 1.6296296296296298, "grad_norm": 4.011212348937988, "learning_rate": 0.00017869244975197748, "loss": 0.3603, "step": 330 }, { "epoch": 1.6790123456790123, "grad_norm": 1.691756248474121, "learning_rate": 0.00016950198165330198, "loss": 0.3657, "step": 340 }, { "epoch": 1.7283950617283952, "grad_norm": 4.096136093139648, "learning_rate": 0.00016023636200470065, "loss": 0.3608, "step": 350 }, { "epoch": 1.7777777777777777, "grad_norm": 3.374934673309326, "learning_rate": 0.00015093129618678526, "loss": 0.3644, "step": 360 }, { "epoch": 1.8271604938271606, "grad_norm": 2.8539466857910156, "learning_rate": 0.0001416226415873234, "loss": 0.393, "step": 370 }, { "epoch": 1.876543209876543, "grad_norm": 6.184209823608398, "learning_rate": 0.00013234626942357447, "loss": 0.3979, "step": 380 }, { "epoch": 1.925925925925926, "grad_norm": 8.18782901763916, "learning_rate": 0.00012313792651133325, "loss": 0.5438, "step": 390 }, { "epoch": 1.9753086419753085, "grad_norm": 0.582004964351654, "learning_rate": 0.00011403309751335898, "loss": 0.3604, "step": 400 }, { "epoch": 1.9753086419753085, "eval_loss": 0.3583581745624542, "eval_runtime": 640.2927, "eval_samples_per_second": 0.281, "eval_steps_per_second": 0.281, "step": 400 }, { "epoch": 2.0246913580246915, "grad_norm": 3.920786142349243, "learning_rate": 0.00010506686819801978, "loss": 0.3926, "step": 410 }, { "epoch": 2.074074074074074, "grad_norm": 2.3339221477508545, "learning_rate": 9.627379023509041e-05, "loss": 0.3697, "step": 420 }, { "epoch": 2.123456790123457, "grad_norm": 0.7499105334281921, "learning_rate": 8.768774804971705e-05, "loss": 0.3496, "step": 430 }, { "epoch": 2.1728395061728394, "grad_norm": 0.7011772990226746, "learning_rate": 7.934182824763187e-05, "loss": 0.3602, "step": 440 }, { "epoch": 2.2222222222222223, "grad_norm": 0.9641762375831604, "learning_rate": 7.126819211479209e-05, "loss": 0.3549, "step": 450 }, { "epoch": 2.271604938271605, "grad_norm": 0.6399113535881042, "learning_rate": 6.349795168276994e-05, "loss": 0.3675, "step": 460 }, { "epoch": 2.3209876543209877, "grad_norm": 0.7869608998298645, "learning_rate": 5.6061049837480616e-05, "loss": 0.352, "step": 470 }, { "epoch": 2.3703703703703702, "grad_norm": 0.6829902529716492, "learning_rate": 4.898614493325209e-05, "loss": 0.3455, "step": 480 }, { "epoch": 2.419753086419753, "grad_norm": 1.2732656002044678, "learning_rate": 4.2300500356881895e-05, "loss": 0.3496, "step": 490 }, { "epoch": 2.4691358024691357, "grad_norm": 0.9332154989242554, "learning_rate": 3.602987946724803e-05, "loss": 0.3435, "step": 500 }, { "epoch": 2.4691358024691357, "eval_loss": 0.3569886386394501, "eval_runtime": 640.5499, "eval_samples_per_second": 0.281, "eval_steps_per_second": 0.281, "step": 500 }, { "epoch": 2.5185185185185186, "grad_norm": 1.809404969215393, "learning_rate": 3.0198446315329134e-05, "loss": 0.3585, "step": 510 }, { "epoch": 2.567901234567901, "grad_norm": 0.32072120904922485, "learning_rate": 2.482867252721145e-05, "loss": 0.346, "step": 520 }, { "epoch": 2.617283950617284, "grad_norm": 1.431264042854309, "learning_rate": 1.9941250708913388e-05, "loss": 0.3658, "step": 530 }, { "epoch": 2.6666666666666665, "grad_norm": 2.0725672245025635, "learning_rate": 1.5555014706723723e-05, "loss": 0.3526, "step": 540 }, { "epoch": 2.7160493827160495, "grad_norm": 1.5036245584487915, "learning_rate": 1.1686867030334379e-05, "loss": 0.3455, "step": 550 }, { "epoch": 2.765432098765432, "grad_norm": 0.9321132898330688, "learning_rate": 8.351713718443865e-06, "loss": 0.3625, "step": 560 }, { "epoch": 2.814814814814815, "grad_norm": 0.8139396905899048, "learning_rate": 5.56240689783013e-06, "loss": 0.3428, "step": 570 }, { "epoch": 2.8641975308641974, "grad_norm": 0.6848800778388977, "learning_rate": 3.3296952572425205e-06, "loss": 0.3431, "step": 580 }, { "epoch": 2.9135802469135803, "grad_norm": 1.2402377128601074, "learning_rate": 1.6621826269641315e-06, "loss": 0.3438, "step": 590 }, { "epoch": 2.962962962962963, "grad_norm": 0.6294095516204834, "learning_rate": 5.662948236587972e-07, "loss": 0.3379, "step": 600 }, { "epoch": 2.962962962962963, "eval_loss": 0.34963592886924744, "eval_runtime": 640.5576, "eval_samples_per_second": 0.281, "eval_steps_per_second": 0.281, "step": 600 }, { "epoch": 2.9925925925925925, "step": 606, "total_flos": 1.4871334589840622e+19, "train_loss": 0.9772530333830578, "train_runtime": 45545.0998, "train_samples_per_second": 0.107, "train_steps_per_second": 0.013 } ], "logging_steps": 10, "max_steps": 606, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 1.4871334589840622e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }