|
{ |
|
"best_metric": 0.7410179640718563, |
|
"best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-rsna-2018/checkpoint-1002", |
|
"epoch": 29.820359281437124, |
|
"eval_steps": 500, |
|
"global_step": 2490, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11976047904191617, |
|
"grad_norm": 5.50510835647583, |
|
"learning_rate": 2.0080321285140564e-06, |
|
"loss": 0.7371, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.23952095808383234, |
|
"grad_norm": 10.76452350616455, |
|
"learning_rate": 4.016064257028113e-06, |
|
"loss": 0.7203, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.3592814371257485, |
|
"grad_norm": 6.238582611083984, |
|
"learning_rate": 6.024096385542169e-06, |
|
"loss": 0.6998, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.47904191616766467, |
|
"grad_norm": 6.8226189613342285, |
|
"learning_rate": 8.032128514056226e-06, |
|
"loss": 0.6751, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.5988023952095808, |
|
"grad_norm": 8.670806884765625, |
|
"learning_rate": 1.0040160642570281e-05, |
|
"loss": 0.6305, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.718562874251497, |
|
"grad_norm": 9.462418556213379, |
|
"learning_rate": 1.2048192771084338e-05, |
|
"loss": 0.6519, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.8383233532934131, |
|
"grad_norm": 4.775774955749512, |
|
"learning_rate": 1.4056224899598394e-05, |
|
"loss": 0.6457, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.9580838323353293, |
|
"grad_norm": 5.374617099761963, |
|
"learning_rate": 1.606425702811245e-05, |
|
"loss": 0.6448, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.9940119760479041, |
|
"eval_accuracy": 0.6736526946107785, |
|
"eval_loss": 0.6735118627548218, |
|
"eval_runtime": 4.2254, |
|
"eval_samples_per_second": 158.092, |
|
"eval_steps_per_second": 4.97, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 1.0778443113772456, |
|
"grad_norm": 9.03003215789795, |
|
"learning_rate": 1.8072289156626505e-05, |
|
"loss": 0.7148, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.1976047904191618, |
|
"grad_norm": 8.153128623962402, |
|
"learning_rate": 2.0080321285140562e-05, |
|
"loss": 0.7236, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.3173652694610778, |
|
"grad_norm": 6.323952674865723, |
|
"learning_rate": 2.208835341365462e-05, |
|
"loss": 0.7318, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.437125748502994, |
|
"grad_norm": 8.060946464538574, |
|
"learning_rate": 2.4096385542168677e-05, |
|
"loss": 0.7131, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.55688622754491, |
|
"grad_norm": 8.46826457977295, |
|
"learning_rate": 2.6104417670682734e-05, |
|
"loss": 0.7513, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.6766467065868262, |
|
"grad_norm": 6.310857772827148, |
|
"learning_rate": 2.8112449799196788e-05, |
|
"loss": 0.7172, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.7964071856287425, |
|
"grad_norm": 7.297194957733154, |
|
"learning_rate": 3.012048192771085e-05, |
|
"loss": 0.7141, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.9161676646706587, |
|
"grad_norm": 5.612814903259277, |
|
"learning_rate": 3.21285140562249e-05, |
|
"loss": 0.736, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.655688622754491, |
|
"eval_loss": 0.6968684196472168, |
|
"eval_runtime": 4.7501, |
|
"eval_samples_per_second": 140.628, |
|
"eval_steps_per_second": 4.421, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 2.035928143712575, |
|
"grad_norm": 6.799407482147217, |
|
"learning_rate": 3.413654618473896e-05, |
|
"loss": 0.7162, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.155688622754491, |
|
"grad_norm": 5.372466087341309, |
|
"learning_rate": 3.614457831325301e-05, |
|
"loss": 0.7359, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.2754491017964074, |
|
"grad_norm": 7.812543869018555, |
|
"learning_rate": 3.815261044176707e-05, |
|
"loss": 0.7076, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.3952095808383236, |
|
"grad_norm": 4.029458522796631, |
|
"learning_rate": 4.0160642570281125e-05, |
|
"loss": 0.7415, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.5149700598802394, |
|
"grad_norm": 4.32454776763916, |
|
"learning_rate": 4.2168674698795186e-05, |
|
"loss": 0.721, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.6347305389221556, |
|
"grad_norm": 4.972012042999268, |
|
"learning_rate": 4.417670682730924e-05, |
|
"loss": 0.7117, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.754491017964072, |
|
"grad_norm": 4.34082555770874, |
|
"learning_rate": 4.61847389558233e-05, |
|
"loss": 0.7245, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.874251497005988, |
|
"grad_norm": 6.3038787841796875, |
|
"learning_rate": 4.8192771084337354e-05, |
|
"loss": 0.7116, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.9940119760479043, |
|
"grad_norm": 5.848024368286133, |
|
"learning_rate": 4.9977688531905406e-05, |
|
"loss": 0.6895, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.9940119760479043, |
|
"eval_accuracy": 0.6916167664670658, |
|
"eval_loss": 0.6264948844909668, |
|
"eval_runtime": 5.127, |
|
"eval_samples_per_second": 130.29, |
|
"eval_steps_per_second": 4.096, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.1137724550898205, |
|
"grad_norm": 5.597372055053711, |
|
"learning_rate": 4.97545738509594e-05, |
|
"loss": 0.6981, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.2335329341317367, |
|
"grad_norm": 3.8474349975585938, |
|
"learning_rate": 4.953145917001339e-05, |
|
"loss": 0.6821, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.3532934131736525, |
|
"grad_norm": 4.591520309448242, |
|
"learning_rate": 4.930834448906738e-05, |
|
"loss": 0.6935, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.4730538922155687, |
|
"grad_norm": 7.39481258392334, |
|
"learning_rate": 4.908522980812137e-05, |
|
"loss": 0.7123, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 3.592814371257485, |
|
"grad_norm": 4.609263896942139, |
|
"learning_rate": 4.886211512717537e-05, |
|
"loss": 0.7209, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 3.712574850299401, |
|
"grad_norm": 6.359745502471924, |
|
"learning_rate": 4.8639000446229364e-05, |
|
"loss": 0.6891, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 3.8323353293413174, |
|
"grad_norm": 3.9733974933624268, |
|
"learning_rate": 4.8415885765283355e-05, |
|
"loss": 0.6796, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.9520958083832336, |
|
"grad_norm": 5.370658874511719, |
|
"learning_rate": 4.8192771084337354e-05, |
|
"loss": 0.6631, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7155688622754491, |
|
"eval_loss": 0.627507209777832, |
|
"eval_runtime": 5.0645, |
|
"eval_samples_per_second": 131.899, |
|
"eval_steps_per_second": 4.147, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 4.07185628742515, |
|
"grad_norm": 4.173786163330078, |
|
"learning_rate": 4.7969656403391346e-05, |
|
"loss": 0.6847, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.191616766467066, |
|
"grad_norm": 8.743853569030762, |
|
"learning_rate": 4.774654172244534e-05, |
|
"loss": 0.6653, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.311377245508982, |
|
"grad_norm": 3.8112645149230957, |
|
"learning_rate": 4.7523427041499336e-05, |
|
"loss": 0.682, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 4.431137724550898, |
|
"grad_norm": 4.91067361831665, |
|
"learning_rate": 4.730031236055333e-05, |
|
"loss": 0.6849, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 4.550898203592815, |
|
"grad_norm": 7.876720428466797, |
|
"learning_rate": 4.707719767960732e-05, |
|
"loss": 0.665, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 4.6706586826347305, |
|
"grad_norm": 3.3110787868499756, |
|
"learning_rate": 4.685408299866131e-05, |
|
"loss": 0.6923, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 4.790419161676647, |
|
"grad_norm": 4.038461208343506, |
|
"learning_rate": 4.663096831771531e-05, |
|
"loss": 0.6776, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 4.910179640718563, |
|
"grad_norm": 4.028420448303223, |
|
"learning_rate": 4.64078536367693e-05, |
|
"loss": 0.6725, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 4.994011976047904, |
|
"eval_accuracy": 0.7125748502994012, |
|
"eval_loss": 0.6311057806015015, |
|
"eval_runtime": 5.1897, |
|
"eval_samples_per_second": 128.717, |
|
"eval_steps_per_second": 4.046, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 5.029940119760479, |
|
"grad_norm": 4.9200215339660645, |
|
"learning_rate": 4.61847389558233e-05, |
|
"loss": 0.6848, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 5.149700598802395, |
|
"grad_norm": 11.384115219116211, |
|
"learning_rate": 4.596162427487729e-05, |
|
"loss": 0.6651, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 5.269461077844311, |
|
"grad_norm": 4.320120811462402, |
|
"learning_rate": 4.5738509593931284e-05, |
|
"loss": 0.6627, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 5.389221556886228, |
|
"grad_norm": 3.2249197959899902, |
|
"learning_rate": 4.5515394912985275e-05, |
|
"loss": 0.695, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 5.508982035928144, |
|
"grad_norm": 6.424835681915283, |
|
"learning_rate": 4.529228023203927e-05, |
|
"loss": 0.665, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 5.62874251497006, |
|
"grad_norm": 3.735926389694214, |
|
"learning_rate": 4.506916555109326e-05, |
|
"loss": 0.6484, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 5.748502994011976, |
|
"grad_norm": 6.180431365966797, |
|
"learning_rate": 4.484605087014726e-05, |
|
"loss": 0.6623, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 5.868263473053892, |
|
"grad_norm": 4.55112886428833, |
|
"learning_rate": 4.4622936189201256e-05, |
|
"loss": 0.6954, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 5.9880239520958085, |
|
"grad_norm": 5.922323226928711, |
|
"learning_rate": 4.439982150825525e-05, |
|
"loss": 0.6778, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.7065868263473054, |
|
"eval_loss": 0.619443953037262, |
|
"eval_runtime": 4.9491, |
|
"eval_samples_per_second": 134.974, |
|
"eval_steps_per_second": 4.243, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 6.107784431137724, |
|
"grad_norm": 4.259535789489746, |
|
"learning_rate": 4.417670682730924e-05, |
|
"loss": 0.6365, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 6.227544910179641, |
|
"grad_norm": 3.808413028717041, |
|
"learning_rate": 4.395359214636323e-05, |
|
"loss": 0.6913, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 6.347305389221557, |
|
"grad_norm": 4.0178632736206055, |
|
"learning_rate": 4.373047746541722e-05, |
|
"loss": 0.7112, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 6.467065868263473, |
|
"grad_norm": 3.5464377403259277, |
|
"learning_rate": 4.350736278447122e-05, |
|
"loss": 0.6287, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 6.586826347305389, |
|
"grad_norm": 5.3495612144470215, |
|
"learning_rate": 4.328424810352521e-05, |
|
"loss": 0.6441, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 6.706586826347305, |
|
"grad_norm": 3.6895763874053955, |
|
"learning_rate": 4.306113342257921e-05, |
|
"loss": 0.6492, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 6.826347305389222, |
|
"grad_norm": 6.391328811645508, |
|
"learning_rate": 4.2838018741633203e-05, |
|
"loss": 0.6347, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 6.946107784431137, |
|
"grad_norm": 3.929858922958374, |
|
"learning_rate": 4.2614904060687195e-05, |
|
"loss": 0.6734, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 6.994011976047904, |
|
"eval_accuracy": 0.7140718562874252, |
|
"eval_loss": 0.602377712726593, |
|
"eval_runtime": 5.4236, |
|
"eval_samples_per_second": 123.166, |
|
"eval_steps_per_second": 3.872, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 7.065868263473054, |
|
"grad_norm": 3.6292643547058105, |
|
"learning_rate": 4.239178937974119e-05, |
|
"loss": 0.6651, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 7.18562874251497, |
|
"grad_norm": 5.194599628448486, |
|
"learning_rate": 4.2168674698795186e-05, |
|
"loss": 0.643, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 7.3053892215568865, |
|
"grad_norm": 4.0095953941345215, |
|
"learning_rate": 4.194556001784918e-05, |
|
"loss": 0.6436, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 7.425149700598802, |
|
"grad_norm": 4.267141819000244, |
|
"learning_rate": 4.172244533690317e-05, |
|
"loss": 0.6114, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 7.544910179640718, |
|
"grad_norm": 3.5301904678344727, |
|
"learning_rate": 4.149933065595716e-05, |
|
"loss": 0.6326, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 7.664670658682635, |
|
"grad_norm": 3.8866894245147705, |
|
"learning_rate": 4.127621597501116e-05, |
|
"loss": 0.6331, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 7.7844311377245505, |
|
"grad_norm": 3.902667284011841, |
|
"learning_rate": 4.105310129406515e-05, |
|
"loss": 0.6515, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 7.904191616766467, |
|
"grad_norm": 4.829390525817871, |
|
"learning_rate": 4.082998661311915e-05, |
|
"loss": 0.6231, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7230538922155688, |
|
"eval_loss": 0.6081866025924683, |
|
"eval_runtime": 5.7887, |
|
"eval_samples_per_second": 115.397, |
|
"eval_steps_per_second": 3.628, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 8.023952095808383, |
|
"grad_norm": 4.755038261413574, |
|
"learning_rate": 4.060687193217314e-05, |
|
"loss": 0.6261, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 8.1437125748503, |
|
"grad_norm": 3.4586455821990967, |
|
"learning_rate": 4.038375725122713e-05, |
|
"loss": 0.6534, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 8.263473053892216, |
|
"grad_norm": 8.194857597351074, |
|
"learning_rate": 4.0160642570281125e-05, |
|
"loss": 0.6329, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 8.383233532934131, |
|
"grad_norm": 2.9734175205230713, |
|
"learning_rate": 3.993752788933512e-05, |
|
"loss": 0.6676, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 8.502994011976048, |
|
"grad_norm": 5.66069221496582, |
|
"learning_rate": 3.9714413208389115e-05, |
|
"loss": 0.6178, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 8.622754491017965, |
|
"grad_norm": 3.4166412353515625, |
|
"learning_rate": 3.949129852744311e-05, |
|
"loss": 0.6399, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 8.74251497005988, |
|
"grad_norm": 5.076518535614014, |
|
"learning_rate": 3.9268183846497105e-05, |
|
"loss": 0.6163, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 8.862275449101796, |
|
"grad_norm": 3.32446551322937, |
|
"learning_rate": 3.90450691655511e-05, |
|
"loss": 0.6152, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 8.982035928143713, |
|
"grad_norm": 4.711836338043213, |
|
"learning_rate": 3.882195448460509e-05, |
|
"loss": 0.6164, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 8.994011976047904, |
|
"eval_accuracy": 0.7170658682634731, |
|
"eval_loss": 0.5845786333084106, |
|
"eval_runtime": 4.984, |
|
"eval_samples_per_second": 134.029, |
|
"eval_steps_per_second": 4.213, |
|
"step": 751 |
|
}, |
|
{ |
|
"epoch": 9.10179640718563, |
|
"grad_norm": 4.9026055335998535, |
|
"learning_rate": 3.859883980365908e-05, |
|
"loss": 0.623, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 9.221556886227544, |
|
"grad_norm": 5.39790678024292, |
|
"learning_rate": 3.837572512271307e-05, |
|
"loss": 0.5958, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 9.341317365269461, |
|
"grad_norm": 4.951222896575928, |
|
"learning_rate": 3.815261044176707e-05, |
|
"loss": 0.6206, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 9.461077844311378, |
|
"grad_norm": 3.8360514640808105, |
|
"learning_rate": 3.792949576082106e-05, |
|
"loss": 0.638, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 9.580838323353294, |
|
"grad_norm": 4.05393123626709, |
|
"learning_rate": 3.770638107987506e-05, |
|
"loss": 0.6561, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 9.70059880239521, |
|
"grad_norm": 5.898914337158203, |
|
"learning_rate": 3.748326639892905e-05, |
|
"loss": 0.6197, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 9.820359281437126, |
|
"grad_norm": 4.737303733825684, |
|
"learning_rate": 3.7260151717983045e-05, |
|
"loss": 0.6068, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 9.940119760479043, |
|
"grad_norm": 3.769287347793579, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 0.6261, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.7380239520958084, |
|
"eval_loss": 0.568150520324707, |
|
"eval_runtime": 5.0718, |
|
"eval_samples_per_second": 131.708, |
|
"eval_steps_per_second": 4.141, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 10.059880239520957, |
|
"grad_norm": 4.2985920906066895, |
|
"learning_rate": 3.6813922356091035e-05, |
|
"loss": 0.6455, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 10.179640718562874, |
|
"grad_norm": 4.344922065734863, |
|
"learning_rate": 3.659080767514503e-05, |
|
"loss": 0.6438, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 10.29940119760479, |
|
"grad_norm": 4.293480396270752, |
|
"learning_rate": 3.636769299419902e-05, |
|
"loss": 0.6161, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 10.419161676646706, |
|
"grad_norm": 5.124499797821045, |
|
"learning_rate": 3.614457831325301e-05, |
|
"loss": 0.5976, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 10.538922155688622, |
|
"grad_norm": 3.7405636310577393, |
|
"learning_rate": 3.592146363230701e-05, |
|
"loss": 0.6125, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 10.658682634730539, |
|
"grad_norm": 3.614593744277954, |
|
"learning_rate": 3.5698348951361e-05, |
|
"loss": 0.6159, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 10.778443113772456, |
|
"grad_norm": 3.6515111923217773, |
|
"learning_rate": 3.5475234270415e-05, |
|
"loss": 0.6095, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 10.89820359281437, |
|
"grad_norm": 4.4123215675354, |
|
"learning_rate": 3.525211958946899e-05, |
|
"loss": 0.6153, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 10.994011976047904, |
|
"eval_accuracy": 0.718562874251497, |
|
"eval_loss": 0.6006675362586975, |
|
"eval_runtime": 4.9154, |
|
"eval_samples_per_second": 135.9, |
|
"eval_steps_per_second": 4.272, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 11.017964071856287, |
|
"grad_norm": 3.9871232509613037, |
|
"learning_rate": 3.502900490852298e-05, |
|
"loss": 0.6124, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 11.137724550898204, |
|
"grad_norm": 4.505560874938965, |
|
"learning_rate": 3.4805890227576974e-05, |
|
"loss": 0.602, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 11.25748502994012, |
|
"grad_norm": 4.445052623748779, |
|
"learning_rate": 3.4582775546630966e-05, |
|
"loss": 0.6002, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 11.377245508982035, |
|
"grad_norm": 3.532015562057495, |
|
"learning_rate": 3.4359660865684965e-05, |
|
"loss": 0.6242, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 11.497005988023952, |
|
"grad_norm": 6.202284812927246, |
|
"learning_rate": 3.413654618473896e-05, |
|
"loss": 0.6113, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 11.616766467065869, |
|
"grad_norm": 5.8437089920043945, |
|
"learning_rate": 3.3913431503792955e-05, |
|
"loss": 0.6037, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 11.736526946107784, |
|
"grad_norm": 4.447215557098389, |
|
"learning_rate": 3.369031682284695e-05, |
|
"loss": 0.6356, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 11.8562874251497, |
|
"grad_norm": 4.645685195922852, |
|
"learning_rate": 3.346720214190094e-05, |
|
"loss": 0.6309, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 11.976047904191617, |
|
"grad_norm": 4.300328731536865, |
|
"learning_rate": 3.324408746095493e-05, |
|
"loss": 0.6046, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.7410179640718563, |
|
"eval_loss": 0.5744568705558777, |
|
"eval_runtime": 4.8113, |
|
"eval_samples_per_second": 138.84, |
|
"eval_steps_per_second": 4.365, |
|
"step": 1002 |
|
}, |
|
{ |
|
"epoch": 12.095808383233534, |
|
"grad_norm": 3.7963852882385254, |
|
"learning_rate": 3.302097278000892e-05, |
|
"loss": 0.5979, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 12.215568862275449, |
|
"grad_norm": 5.136913776397705, |
|
"learning_rate": 3.279785809906292e-05, |
|
"loss": 0.6082, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 12.335329341317365, |
|
"grad_norm": 5.207279205322266, |
|
"learning_rate": 3.257474341811691e-05, |
|
"loss": 0.5884, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 12.455089820359282, |
|
"grad_norm": 4.387267589569092, |
|
"learning_rate": 3.235162873717091e-05, |
|
"loss": 0.6156, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 12.574850299401197, |
|
"grad_norm": 3.0785038471221924, |
|
"learning_rate": 3.21285140562249e-05, |
|
"loss": 0.6093, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 12.694610778443113, |
|
"grad_norm": 3.2741942405700684, |
|
"learning_rate": 3.1905399375278894e-05, |
|
"loss": 0.6033, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 12.81437125748503, |
|
"grad_norm": 4.547226428985596, |
|
"learning_rate": 3.1682284694332886e-05, |
|
"loss": 0.6157, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 12.934131736526947, |
|
"grad_norm": 4.921385765075684, |
|
"learning_rate": 3.1459170013386885e-05, |
|
"loss": 0.5679, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 12.994011976047904, |
|
"eval_accuracy": 0.7230538922155688, |
|
"eval_loss": 0.595708429813385, |
|
"eval_runtime": 4.9064, |
|
"eval_samples_per_second": 136.149, |
|
"eval_steps_per_second": 4.28, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 13.053892215568862, |
|
"grad_norm": 6.123495578765869, |
|
"learning_rate": 3.1236055332440876e-05, |
|
"loss": 0.5848, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 13.173652694610778, |
|
"grad_norm": 3.932276964187622, |
|
"learning_rate": 3.101294065149487e-05, |
|
"loss": 0.6018, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 13.293413173652695, |
|
"grad_norm": 4.827797889709473, |
|
"learning_rate": 3.078982597054887e-05, |
|
"loss": 0.6069, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 13.41317365269461, |
|
"grad_norm": 3.6165199279785156, |
|
"learning_rate": 3.056671128960286e-05, |
|
"loss": 0.6036, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 13.532934131736527, |
|
"grad_norm": 4.21213960647583, |
|
"learning_rate": 3.034359660865685e-05, |
|
"loss": 0.5872, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 13.652694610778443, |
|
"grad_norm": 6.651663780212402, |
|
"learning_rate": 3.012048192771085e-05, |
|
"loss": 0.6057, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 13.77245508982036, |
|
"grad_norm": 5.351555824279785, |
|
"learning_rate": 2.989736724676484e-05, |
|
"loss": 0.5951, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 13.892215568862275, |
|
"grad_norm": 5.491767406463623, |
|
"learning_rate": 2.9674252565818832e-05, |
|
"loss": 0.6027, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.7215568862275449, |
|
"eval_loss": 0.5884155631065369, |
|
"eval_runtime": 4.7316, |
|
"eval_samples_per_second": 141.178, |
|
"eval_steps_per_second": 4.438, |
|
"step": 1169 |
|
}, |
|
{ |
|
"epoch": 14.011976047904191, |
|
"grad_norm": 4.720607757568359, |
|
"learning_rate": 2.9451137884872827e-05, |
|
"loss": 0.5779, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 14.131736526946108, |
|
"grad_norm": 4.377676963806152, |
|
"learning_rate": 2.922802320392682e-05, |
|
"loss": 0.5992, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 14.251497005988025, |
|
"grad_norm": 4.529723644256592, |
|
"learning_rate": 2.900490852298081e-05, |
|
"loss": 0.6127, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 14.37125748502994, |
|
"grad_norm": 3.385350465774536, |
|
"learning_rate": 2.878179384203481e-05, |
|
"loss": 0.5876, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 14.491017964071856, |
|
"grad_norm": 5.14049768447876, |
|
"learning_rate": 2.85586791610888e-05, |
|
"loss": 0.5685, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 14.610778443113773, |
|
"grad_norm": 6.226632118225098, |
|
"learning_rate": 2.8335564480142796e-05, |
|
"loss": 0.5957, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 14.730538922155688, |
|
"grad_norm": 4.943429470062256, |
|
"learning_rate": 2.8112449799196788e-05, |
|
"loss": 0.5709, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 14.850299401197605, |
|
"grad_norm": 3.945502758026123, |
|
"learning_rate": 2.788933511825078e-05, |
|
"loss": 0.5807, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 14.970059880239521, |
|
"grad_norm": 5.3703718185424805, |
|
"learning_rate": 2.7666220437304775e-05, |
|
"loss": 0.6249, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 14.994011976047904, |
|
"eval_accuracy": 0.7365269461077845, |
|
"eval_loss": 0.5808472037315369, |
|
"eval_runtime": 4.8318, |
|
"eval_samples_per_second": 138.251, |
|
"eval_steps_per_second": 4.346, |
|
"step": 1252 |
|
}, |
|
{ |
|
"epoch": 15.089820359281438, |
|
"grad_norm": 3.6692726612091064, |
|
"learning_rate": 2.7443105756358774e-05, |
|
"loss": 0.5944, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 15.209580838323353, |
|
"grad_norm": 4.581055164337158, |
|
"learning_rate": 2.7219991075412765e-05, |
|
"loss": 0.5599, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 15.32934131736527, |
|
"grad_norm": 3.810741901397705, |
|
"learning_rate": 2.6996876394466757e-05, |
|
"loss": 0.577, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 15.449101796407186, |
|
"grad_norm": 4.873391151428223, |
|
"learning_rate": 2.6773761713520752e-05, |
|
"loss": 0.565, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 15.568862275449101, |
|
"grad_norm": 4.216405391693115, |
|
"learning_rate": 2.6550647032574744e-05, |
|
"loss": 0.5709, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 15.688622754491018, |
|
"grad_norm": 4.651778697967529, |
|
"learning_rate": 2.6327532351628736e-05, |
|
"loss": 0.5984, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 15.808383233532934, |
|
"grad_norm": 4.126067638397217, |
|
"learning_rate": 2.6104417670682734e-05, |
|
"loss": 0.5718, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 15.928143712574851, |
|
"grad_norm": 3.843979597091675, |
|
"learning_rate": 2.5881302989736726e-05, |
|
"loss": 0.6059, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.7350299401197605, |
|
"eval_loss": 0.5699232816696167, |
|
"eval_runtime": 5.1217, |
|
"eval_samples_per_second": 130.426, |
|
"eval_steps_per_second": 4.1, |
|
"step": 1336 |
|
}, |
|
{ |
|
"epoch": 16.047904191616766, |
|
"grad_norm": 3.6033108234405518, |
|
"learning_rate": 2.565818830879072e-05, |
|
"loss": 0.5773, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 16.16766467065868, |
|
"grad_norm": 5.372034549713135, |
|
"learning_rate": 2.5435073627844713e-05, |
|
"loss": 0.5818, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 16.2874251497006, |
|
"grad_norm": 4.0521955490112305, |
|
"learning_rate": 2.5211958946898705e-05, |
|
"loss": 0.5517, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 16.407185628742514, |
|
"grad_norm": 4.455647945404053, |
|
"learning_rate": 2.4988844265952703e-05, |
|
"loss": 0.5904, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 16.526946107784433, |
|
"grad_norm": 4.8622589111328125, |
|
"learning_rate": 2.4765729585006695e-05, |
|
"loss": 0.5943, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 16.646706586826348, |
|
"grad_norm": 5.169972896575928, |
|
"learning_rate": 2.4542614904060687e-05, |
|
"loss": 0.5841, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 16.766467065868262, |
|
"grad_norm": 4.88759708404541, |
|
"learning_rate": 2.4319500223114682e-05, |
|
"loss": 0.5639, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 16.88622754491018, |
|
"grad_norm": 5.843952178955078, |
|
"learning_rate": 2.4096385542168677e-05, |
|
"loss": 0.5776, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 16.994011976047904, |
|
"eval_accuracy": 0.7320359281437125, |
|
"eval_loss": 0.5769894123077393, |
|
"eval_runtime": 5.3536, |
|
"eval_samples_per_second": 124.775, |
|
"eval_steps_per_second": 3.923, |
|
"step": 1419 |
|
}, |
|
{ |
|
"epoch": 17.005988023952096, |
|
"grad_norm": 8.036992073059082, |
|
"learning_rate": 2.387327086122267e-05, |
|
"loss": 0.5775, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 17.12574850299401, |
|
"grad_norm": 8.761232376098633, |
|
"learning_rate": 2.3650156180276664e-05, |
|
"loss": 0.56, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 17.24550898203593, |
|
"grad_norm": 3.9877655506134033, |
|
"learning_rate": 2.3427041499330656e-05, |
|
"loss": 0.5409, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 17.365269461077844, |
|
"grad_norm": 8.304950714111328, |
|
"learning_rate": 2.320392681838465e-05, |
|
"loss": 0.5696, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 17.48502994011976, |
|
"grad_norm": 4.155704498291016, |
|
"learning_rate": 2.2980812137438646e-05, |
|
"loss": 0.5888, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 17.604790419161677, |
|
"grad_norm": 4.174530029296875, |
|
"learning_rate": 2.2757697456492638e-05, |
|
"loss": 0.5566, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 17.724550898203592, |
|
"grad_norm": 6.5803093910217285, |
|
"learning_rate": 2.253458277554663e-05, |
|
"loss": 0.5385, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 17.84431137724551, |
|
"grad_norm": 4.418693542480469, |
|
"learning_rate": 2.2311468094600628e-05, |
|
"loss": 0.5411, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 17.964071856287426, |
|
"grad_norm": 4.162361145019531, |
|
"learning_rate": 2.208835341365462e-05, |
|
"loss": 0.5903, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.7215568862275449, |
|
"eval_loss": 0.5806027054786682, |
|
"eval_runtime": 4.6322, |
|
"eval_samples_per_second": 144.209, |
|
"eval_steps_per_second": 4.534, |
|
"step": 1503 |
|
}, |
|
{ |
|
"epoch": 18.08383233532934, |
|
"grad_norm": 4.274399280548096, |
|
"learning_rate": 2.186523873270861e-05, |
|
"loss": 0.5612, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 18.20359281437126, |
|
"grad_norm": 5.013104438781738, |
|
"learning_rate": 2.1642124051762607e-05, |
|
"loss": 0.5654, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 18.323353293413174, |
|
"grad_norm": 4.541128635406494, |
|
"learning_rate": 2.1419009370816602e-05, |
|
"loss": 0.5492, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 18.44311377245509, |
|
"grad_norm": 4.409793853759766, |
|
"learning_rate": 2.1195894689870593e-05, |
|
"loss": 0.564, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 18.562874251497007, |
|
"grad_norm": 3.6504085063934326, |
|
"learning_rate": 2.097278000892459e-05, |
|
"loss": 0.5531, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 18.682634730538922, |
|
"grad_norm": 4.4471235275268555, |
|
"learning_rate": 2.074966532797858e-05, |
|
"loss": 0.5911, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 18.802395209580837, |
|
"grad_norm": 4.53286600112915, |
|
"learning_rate": 2.0526550647032576e-05, |
|
"loss": 0.572, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 18.922155688622755, |
|
"grad_norm": 4.156210899353027, |
|
"learning_rate": 2.030343596608657e-05, |
|
"loss": 0.5633, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 18.994011976047904, |
|
"eval_accuracy": 0.7380239520958084, |
|
"eval_loss": 0.576755166053772, |
|
"eval_runtime": 5.1497, |
|
"eval_samples_per_second": 129.715, |
|
"eval_steps_per_second": 4.078, |
|
"step": 1586 |
|
}, |
|
{ |
|
"epoch": 19.04191616766467, |
|
"grad_norm": 4.36007022857666, |
|
"learning_rate": 2.0080321285140562e-05, |
|
"loss": 0.5304, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 19.161676646706585, |
|
"grad_norm": 6.453935623168945, |
|
"learning_rate": 1.9857206604194558e-05, |
|
"loss": 0.5595, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 19.281437125748504, |
|
"grad_norm": 4.35880708694458, |
|
"learning_rate": 1.9634091923248553e-05, |
|
"loss": 0.5578, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 19.40119760479042, |
|
"grad_norm": 4.454617500305176, |
|
"learning_rate": 1.9410977242302544e-05, |
|
"loss": 0.532, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 19.520958083832337, |
|
"grad_norm": 4.35628604888916, |
|
"learning_rate": 1.9187862561356536e-05, |
|
"loss": 0.5433, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 19.64071856287425, |
|
"grad_norm": 4.488490104675293, |
|
"learning_rate": 1.896474788041053e-05, |
|
"loss": 0.5545, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 19.760479041916167, |
|
"grad_norm": 5.309743881225586, |
|
"learning_rate": 1.8741633199464527e-05, |
|
"loss": 0.5793, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 19.880239520958085, |
|
"grad_norm": 4.599639415740967, |
|
"learning_rate": 1.8518518518518518e-05, |
|
"loss": 0.55, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 4.7093892097473145, |
|
"learning_rate": 1.8295403837572513e-05, |
|
"loss": 0.5544, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.7350299401197605, |
|
"eval_loss": 0.5829503536224365, |
|
"eval_runtime": 4.8934, |
|
"eval_samples_per_second": 136.511, |
|
"eval_steps_per_second": 4.292, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 20.119760479041915, |
|
"grad_norm": 4.780275344848633, |
|
"learning_rate": 1.8072289156626505e-05, |
|
"loss": 0.5465, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 20.239520958083833, |
|
"grad_norm": 4.53477668762207, |
|
"learning_rate": 1.78491744756805e-05, |
|
"loss": 0.525, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 20.35928143712575, |
|
"grad_norm": 5.508608818054199, |
|
"learning_rate": 1.7626059794734495e-05, |
|
"loss": 0.5768, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 20.479041916167663, |
|
"grad_norm": 5.174022197723389, |
|
"learning_rate": 1.7402945113788487e-05, |
|
"loss": 0.5308, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 20.59880239520958, |
|
"grad_norm": 4.175419330596924, |
|
"learning_rate": 1.7179830432842482e-05, |
|
"loss": 0.5609, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 20.718562874251496, |
|
"grad_norm": 4.944879055023193, |
|
"learning_rate": 1.6956715751896478e-05, |
|
"loss": 0.5323, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 20.83832335329341, |
|
"grad_norm": 4.967810153961182, |
|
"learning_rate": 1.673360107095047e-05, |
|
"loss": 0.5234, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 20.95808383233533, |
|
"grad_norm": 5.018576622009277, |
|
"learning_rate": 1.651048639000446e-05, |
|
"loss": 0.5515, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 20.994011976047904, |
|
"eval_accuracy": 0.7260479041916168, |
|
"eval_loss": 0.5965989232063293, |
|
"eval_runtime": 4.6792, |
|
"eval_samples_per_second": 142.759, |
|
"eval_steps_per_second": 4.488, |
|
"step": 1753 |
|
}, |
|
{ |
|
"epoch": 21.077844311377245, |
|
"grad_norm": 4.204094886779785, |
|
"learning_rate": 1.6287371709058456e-05, |
|
"loss": 0.5672, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 21.197604790419163, |
|
"grad_norm": 4.336850643157959, |
|
"learning_rate": 1.606425702811245e-05, |
|
"loss": 0.5011, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 21.317365269461078, |
|
"grad_norm": 5.5377583503723145, |
|
"learning_rate": 1.5841142347166443e-05, |
|
"loss": 0.5755, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 21.437125748502993, |
|
"grad_norm": 5.438154697418213, |
|
"learning_rate": 1.5618027666220438e-05, |
|
"loss": 0.5349, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 21.55688622754491, |
|
"grad_norm": 5.496122360229492, |
|
"learning_rate": 1.5394912985274433e-05, |
|
"loss": 0.5754, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 21.676646706586826, |
|
"grad_norm": 5.726380348205566, |
|
"learning_rate": 1.5171798304328425e-05, |
|
"loss": 0.5778, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 21.79640718562874, |
|
"grad_norm": 4.841021537780762, |
|
"learning_rate": 1.494868362338242e-05, |
|
"loss": 0.5269, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 21.91616766467066, |
|
"grad_norm": 4.8706583976745605, |
|
"learning_rate": 1.4725568942436414e-05, |
|
"loss": 0.5249, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.7335329341317365, |
|
"eval_loss": 0.6078537106513977, |
|
"eval_runtime": 4.6418, |
|
"eval_samples_per_second": 143.909, |
|
"eval_steps_per_second": 4.524, |
|
"step": 1837 |
|
}, |
|
{ |
|
"epoch": 22.035928143712574, |
|
"grad_norm": 6.536875247955322, |
|
"learning_rate": 1.4502454261490405e-05, |
|
"loss": 0.5403, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 22.15568862275449, |
|
"grad_norm": 5.156835079193115, |
|
"learning_rate": 1.42793395805444e-05, |
|
"loss": 0.5547, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 22.275449101796408, |
|
"grad_norm": 4.475517749786377, |
|
"learning_rate": 1.4056224899598394e-05, |
|
"loss": 0.5474, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 22.395209580838323, |
|
"grad_norm": 5.913077354431152, |
|
"learning_rate": 1.3833110218652387e-05, |
|
"loss": 0.5242, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 22.51497005988024, |
|
"grad_norm": 4.135039806365967, |
|
"learning_rate": 1.3609995537706383e-05, |
|
"loss": 0.527, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 22.634730538922156, |
|
"grad_norm": 4.773129940032959, |
|
"learning_rate": 1.3386880856760376e-05, |
|
"loss": 0.5828, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 22.75449101796407, |
|
"grad_norm": 4.826302528381348, |
|
"learning_rate": 1.3163766175814368e-05, |
|
"loss": 0.5407, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 22.87425149700599, |
|
"grad_norm": 5.572017669677734, |
|
"learning_rate": 1.2940651494868363e-05, |
|
"loss": 0.5329, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 22.994011976047904, |
|
"grad_norm": 5.162022113800049, |
|
"learning_rate": 1.2717536813922356e-05, |
|
"loss": 0.5212, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 22.994011976047904, |
|
"eval_accuracy": 0.7245508982035929, |
|
"eval_loss": 0.5972306132316589, |
|
"eval_runtime": 5.0266, |
|
"eval_samples_per_second": 132.892, |
|
"eval_steps_per_second": 4.178, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 23.11377245508982, |
|
"grad_norm": 5.251833915710449, |
|
"learning_rate": 1.2494422132976352e-05, |
|
"loss": 0.5422, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 23.233532934131738, |
|
"grad_norm": 4.400450706481934, |
|
"learning_rate": 1.2271307452030343e-05, |
|
"loss": 0.512, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 23.353293413173652, |
|
"grad_norm": 5.726296901702881, |
|
"learning_rate": 1.2048192771084338e-05, |
|
"loss": 0.5167, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 23.473053892215567, |
|
"grad_norm": 5.346691131591797, |
|
"learning_rate": 1.1825078090138332e-05, |
|
"loss": 0.5554, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 23.592814371257486, |
|
"grad_norm": 6.358211994171143, |
|
"learning_rate": 1.1601963409192325e-05, |
|
"loss": 0.525, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 23.7125748502994, |
|
"grad_norm": 4.755873680114746, |
|
"learning_rate": 1.1378848728246319e-05, |
|
"loss": 0.5479, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 23.83233532934132, |
|
"grad_norm": 6.708542346954346, |
|
"learning_rate": 1.1155734047300314e-05, |
|
"loss": 0.5457, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 23.952095808383234, |
|
"grad_norm": 4.143124103546143, |
|
"learning_rate": 1.0932619366354306e-05, |
|
"loss": 0.5268, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.7230538922155688, |
|
"eval_loss": 0.5921865701675415, |
|
"eval_runtime": 5.3047, |
|
"eval_samples_per_second": 125.926, |
|
"eval_steps_per_second": 3.959, |
|
"step": 2004 |
|
}, |
|
{ |
|
"epoch": 24.07185628742515, |
|
"grad_norm": 5.585824489593506, |
|
"learning_rate": 1.0709504685408301e-05, |
|
"loss": 0.4723, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 24.191616766467067, |
|
"grad_norm": 5.241277694702148, |
|
"learning_rate": 1.0486390004462294e-05, |
|
"loss": 0.5317, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 24.311377245508982, |
|
"grad_norm": 5.840533256530762, |
|
"learning_rate": 1.0263275323516288e-05, |
|
"loss": 0.5458, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 24.431137724550897, |
|
"grad_norm": 5.158961772918701, |
|
"learning_rate": 1.0040160642570281e-05, |
|
"loss": 0.5169, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 24.550898203592816, |
|
"grad_norm": 4.132058620452881, |
|
"learning_rate": 9.817045961624276e-06, |
|
"loss": 0.531, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 24.67065868263473, |
|
"grad_norm": 5.0692830085754395, |
|
"learning_rate": 9.593931280678268e-06, |
|
"loss": 0.5462, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 24.790419161676645, |
|
"grad_norm": 5.38627815246582, |
|
"learning_rate": 9.370816599732263e-06, |
|
"loss": 0.5151, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 24.910179640718564, |
|
"grad_norm": 5.605633735656738, |
|
"learning_rate": 9.147701918786257e-06, |
|
"loss": 0.5406, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 24.994011976047904, |
|
"eval_accuracy": 0.7350299401197605, |
|
"eval_loss": 0.609959602355957, |
|
"eval_runtime": 5.5754, |
|
"eval_samples_per_second": 119.811, |
|
"eval_steps_per_second": 3.767, |
|
"step": 2087 |
|
}, |
|
{ |
|
"epoch": 25.02994011976048, |
|
"grad_norm": 5.846259117126465, |
|
"learning_rate": 8.92458723784025e-06, |
|
"loss": 0.5213, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 25.149700598802394, |
|
"grad_norm": 7.1593708992004395, |
|
"learning_rate": 8.701472556894244e-06, |
|
"loss": 0.5329, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 25.269461077844312, |
|
"grad_norm": 5.0290985107421875, |
|
"learning_rate": 8.478357875948239e-06, |
|
"loss": 0.5042, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 25.389221556886227, |
|
"grad_norm": 4.644502639770508, |
|
"learning_rate": 8.25524319500223e-06, |
|
"loss": 0.5227, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 25.508982035928145, |
|
"grad_norm": 5.292252540588379, |
|
"learning_rate": 8.032128514056226e-06, |
|
"loss": 0.5137, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 25.62874251497006, |
|
"grad_norm": 4.673940658569336, |
|
"learning_rate": 7.809013833110219e-06, |
|
"loss": 0.5117, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 25.748502994011975, |
|
"grad_norm": 5.361245632171631, |
|
"learning_rate": 7.5858991521642126e-06, |
|
"loss": 0.5067, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 25.868263473053894, |
|
"grad_norm": 4.769536018371582, |
|
"learning_rate": 7.362784471218207e-06, |
|
"loss": 0.5562, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 25.98802395209581, |
|
"grad_norm": 5.589327335357666, |
|
"learning_rate": 7.1396697902722e-06, |
|
"loss": 0.5257, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.7305389221556886, |
|
"eval_loss": 0.6003913879394531, |
|
"eval_runtime": 5.6113, |
|
"eval_samples_per_second": 119.045, |
|
"eval_steps_per_second": 3.742, |
|
"step": 2171 |
|
}, |
|
{ |
|
"epoch": 26.107784431137723, |
|
"grad_norm": 5.76840353012085, |
|
"learning_rate": 6.916555109326194e-06, |
|
"loss": 0.5305, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 26.227544910179642, |
|
"grad_norm": 4.287968158721924, |
|
"learning_rate": 6.693440428380188e-06, |
|
"loss": 0.5028, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 26.347305389221557, |
|
"grad_norm": 7.93202543258667, |
|
"learning_rate": 6.4703257474341815e-06, |
|
"loss": 0.5173, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 26.46706586826347, |
|
"grad_norm": 5.515824794769287, |
|
"learning_rate": 6.247211066488176e-06, |
|
"loss": 0.5111, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 26.58682634730539, |
|
"grad_norm": 7.206600666046143, |
|
"learning_rate": 6.024096385542169e-06, |
|
"loss": 0.5112, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 26.706586826347305, |
|
"grad_norm": 4.152039527893066, |
|
"learning_rate": 5.800981704596163e-06, |
|
"loss": 0.4995, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 26.82634730538922, |
|
"grad_norm": 5.948368072509766, |
|
"learning_rate": 5.577867023650157e-06, |
|
"loss": 0.5431, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 26.94610778443114, |
|
"grad_norm": 4.189924240112305, |
|
"learning_rate": 5.3547523427041504e-06, |
|
"loss": 0.5152, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 26.994011976047904, |
|
"eval_accuracy": 0.7320359281437125, |
|
"eval_loss": 0.6091659665107727, |
|
"eval_runtime": 4.8264, |
|
"eval_samples_per_second": 138.405, |
|
"eval_steps_per_second": 4.351, |
|
"step": 2254 |
|
}, |
|
{ |
|
"epoch": 27.065868263473053, |
|
"grad_norm": 5.771645545959473, |
|
"learning_rate": 5.131637661758144e-06, |
|
"loss": 0.5083, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 27.18562874251497, |
|
"grad_norm": 5.393200397491455, |
|
"learning_rate": 4.908522980812138e-06, |
|
"loss": 0.4818, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 27.305389221556887, |
|
"grad_norm": 5.9255828857421875, |
|
"learning_rate": 4.685408299866132e-06, |
|
"loss": 0.5215, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 27.4251497005988, |
|
"grad_norm": 5.309273719787598, |
|
"learning_rate": 4.462293618920125e-06, |
|
"loss": 0.4627, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 27.54491017964072, |
|
"grad_norm": 6.227340221405029, |
|
"learning_rate": 4.239178937974119e-06, |
|
"loss": 0.5067, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 27.664670658682635, |
|
"grad_norm": 4.3603105545043945, |
|
"learning_rate": 4.016064257028113e-06, |
|
"loss": 0.5189, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 27.78443113772455, |
|
"grad_norm": 5.04020357131958, |
|
"learning_rate": 3.7929495760821063e-06, |
|
"loss": 0.5311, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 27.904191616766468, |
|
"grad_norm": 4.995678901672363, |
|
"learning_rate": 3.5698348951361e-06, |
|
"loss": 0.4858, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.7230538922155688, |
|
"eval_loss": 0.6100460886955261, |
|
"eval_runtime": 5.2608, |
|
"eval_samples_per_second": 126.978, |
|
"eval_steps_per_second": 3.992, |
|
"step": 2338 |
|
}, |
|
{ |
|
"epoch": 28.023952095808383, |
|
"grad_norm": 3.8030707836151123, |
|
"learning_rate": 3.346720214190094e-06, |
|
"loss": 0.5329, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 28.143712574850298, |
|
"grad_norm": 4.550446033477783, |
|
"learning_rate": 3.123605533244088e-06, |
|
"loss": 0.4682, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 28.263473053892216, |
|
"grad_norm": 4.607069492340088, |
|
"learning_rate": 2.9004908522980813e-06, |
|
"loss": 0.4928, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 28.38323353293413, |
|
"grad_norm": 4.880204677581787, |
|
"learning_rate": 2.6773761713520752e-06, |
|
"loss": 0.5187, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 28.50299401197605, |
|
"grad_norm": 4.848081111907959, |
|
"learning_rate": 2.454261490406069e-06, |
|
"loss": 0.5051, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 28.622754491017965, |
|
"grad_norm": 5.9451189041137695, |
|
"learning_rate": 2.2311468094600625e-06, |
|
"loss": 0.5156, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 28.74251497005988, |
|
"grad_norm": 5.852599143981934, |
|
"learning_rate": 2.0080321285140564e-06, |
|
"loss": 0.4913, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 28.862275449101798, |
|
"grad_norm": 5.628122329711914, |
|
"learning_rate": 1.78491744756805e-06, |
|
"loss": 0.4711, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 28.982035928143713, |
|
"grad_norm": 5.2325358390808105, |
|
"learning_rate": 1.561802766622044e-06, |
|
"loss": 0.5412, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 28.994011976047904, |
|
"eval_accuracy": 0.7350299401197605, |
|
"eval_loss": 0.6115620136260986, |
|
"eval_runtime": 5.4339, |
|
"eval_samples_per_second": 122.932, |
|
"eval_steps_per_second": 3.865, |
|
"step": 2421 |
|
}, |
|
{ |
|
"epoch": 29.101796407185628, |
|
"grad_norm": 5.715780258178711, |
|
"learning_rate": 1.3386880856760376e-06, |
|
"loss": 0.528, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 29.221556886227546, |
|
"grad_norm": 5.140947341918945, |
|
"learning_rate": 1.1155734047300313e-06, |
|
"loss": 0.5015, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 29.34131736526946, |
|
"grad_norm": 5.15585470199585, |
|
"learning_rate": 8.92458723784025e-07, |
|
"loss": 0.4934, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 29.461077844311376, |
|
"grad_norm": 3.845532178878784, |
|
"learning_rate": 6.693440428380188e-07, |
|
"loss": 0.4809, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 29.580838323353294, |
|
"grad_norm": 5.481026649475098, |
|
"learning_rate": 4.462293618920125e-07, |
|
"loss": 0.4645, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 29.70059880239521, |
|
"grad_norm": 4.100111484527588, |
|
"learning_rate": 2.2311468094600626e-07, |
|
"loss": 0.4856, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 29.820359281437124, |
|
"grad_norm": 5.421600818634033, |
|
"learning_rate": 0.0, |
|
"loss": 0.4972, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 29.820359281437124, |
|
"eval_accuracy": 0.7290419161676647, |
|
"eval_loss": 0.6119701862335205, |
|
"eval_runtime": 4.8411, |
|
"eval_samples_per_second": 137.985, |
|
"eval_steps_per_second": 4.338, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 29.820359281437124, |
|
"step": 2490, |
|
"total_flos": 7.910788670992908e+18, |
|
"train_loss": 0.5937201630159554, |
|
"train_runtime": 3590.0203, |
|
"train_samples_per_second": 89.181, |
|
"train_steps_per_second": 0.694 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2490, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 30, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.910788670992908e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |