eugenepentland commited on
Commit
a2ef6f2
1 Parent(s): d72709f

Training in progress, epoch 5, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4862ebcdc8d2768d6132069a64ba7dfa94cdbf6455d4d4560f35f71fcc32dfb
3
  size 272138666
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c448104ce2d154398926b10f5da1a068cc005b858ff145e388bffd773d180f87
3
  size 272138666
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:094d5e98b1a3b1417249f16f63b4282afd3829ec4a31dbc2bc00df331ba5faf1
3
  size 136067312
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d522f48035551cde94f250a105d3f95296a4bbeab93d5b3d3564c104aecb6596
3
  size 136067312
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:67b5b6dd9206c42165f5188e8effb6be1e40ed13cd768d8b7a1b17575bf03d9e
3
  size 21687
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d46f7c0ba4feb2125c13deda1ba0b721e0efe91bcba4140ca17136cad45c48a3
3
  size 21687
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6828f225fa32178c33ef119d710f22e24b0bc0c656e9d474379f8495e0908384
3
  size 21687
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85312c5143eb72e5e61ba4a319c997e347a39a399dd2227831e2d75a9642adec
3
  size 21687
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:35c188ece0af9a1c1c070d68232ff9e3dc42d760df0f2e5e280f4c2013a3e538
3
  size 21687
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6a8352611a4312779c3a8d7ccc5cd6742e36dab552d14e699adf613271f5c02
3
  size 21687
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1f9da3ceb24bed58a42dc4e81d0a1e02d0fbd589dd70fb982262e3a3a271213
3
  size 21687
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8b37613fe778cfe2e6f052607845b13507fca8d228aab57a081250d0579c8f0c
3
  size 21687
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:05f3bf8804c8e87d47beaafc30681cc2d8abc53f079c3ec27b8405f27eafb62e
3
  size 21687
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe1719e13a0f31b2a8346b234c197bedeb66d5e3a9ca796b9296180e2a5acd65
3
  size 21687
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c4a3224faf90dcc36243bfcd086fe8dbeb1b17d0f25a5e2b7d1d315effde3250
3
  size 21687
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ee6061d4eb9c4722da13bb14acdbd266fcc9780b11acf895ab0612c1b31f5a61
3
  size 21687
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4feb4ed70cfe7ab739f8d7012e2560e686d994777ba7aa8513abce68d4c42d9a
3
  size 21687
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f111ad4c6d15bd49b5db0a3a88b93cea27f4f9c389fa8e486ed9639c4a3d613c
3
  size 21687
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1934f6a0f7b05c9571484b9dbea2c6fb5ae573367912d95f0629f9b4ebdf3aa7
3
  size 21687
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:111c33490270e9bb9e95ce9eed65ea10f26f937a0aec179323d14088fa7d6a98
3
  size 21687
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f1d49d9630e070befda78ef6b64c8fdc0bb6b5103c4e1c8f4b6fa80c9bcbdbb4
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed29c8a99623c2a97230e7e5a3a6a12bd70f67f1cb68462fbad60d321f9a3ce3
3
  size 627
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.0,
5
  "eval_steps": 25,
6
- "global_step": 618,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -937,13 +937,647 @@
937
  "learning_rate": 1.7493335291185675e-05,
938
  "loss": 0.0324,
939
  "step": 615
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
940
  }
941
  ],
942
  "logging_steps": 5,
943
  "max_steps": 1030,
944
  "num_train_epochs": 5,
945
  "save_steps": 500,
946
- "total_flos": 275559470923776.0,
947
  "trial_name": null,
948
  "trial_params": null
949
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 5.0,
5
  "eval_steps": 25,
6
+ "global_step": 1030,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
937
  "learning_rate": 1.7493335291185675e-05,
938
  "loss": 0.0324,
939
  "step": 615
940
+ },
941
+ {
942
+ "epoch": 3.01,
943
+ "learning_rate": 1.7130543699071327e-05,
944
+ "loss": 0.0459,
945
+ "step": 620
946
+ },
947
+ {
948
+ "epoch": 3.03,
949
+ "learning_rate": 1.67695824264883e-05,
950
+ "loss": 0.0365,
951
+ "step": 625
952
+ },
953
+ {
954
+ "epoch": 3.03,
955
+ "eval_loss": 0.04022263363003731,
956
+ "eval_runtime": 0.1169,
957
+ "eval_samples_per_second": 3516.3,
958
+ "eval_steps_per_second": 111.221,
959
+ "step": 625
960
+ },
961
+ {
962
+ "epoch": 3.06,
963
+ "learning_rate": 1.6410535427707634e-05,
964
+ "loss": 0.0307,
965
+ "step": 630
966
+ },
967
+ {
968
+ "epoch": 3.08,
969
+ "learning_rate": 1.6053486211768503e-05,
970
+ "loss": 0.0316,
971
+ "step": 635
972
+ },
973
+ {
974
+ "epoch": 3.11,
975
+ "learning_rate": 1.5698517823055242e-05,
976
+ "loss": 0.0236,
977
+ "step": 640
978
+ },
979
+ {
980
+ "epoch": 3.13,
981
+ "learning_rate": 1.5345712821982443e-05,
982
+ "loss": 0.0187,
983
+ "step": 645
984
+ },
985
+ {
986
+ "epoch": 3.16,
987
+ "learning_rate": 1.4995153265792642e-05,
988
+ "loss": 0.0282,
989
+ "step": 650
990
+ },
991
+ {
992
+ "epoch": 3.16,
993
+ "eval_loss": 0.04607125371694565,
994
+ "eval_runtime": 0.1202,
995
+ "eval_samples_per_second": 3419.582,
996
+ "eval_steps_per_second": 108.162,
997
+ "step": 650
998
+ },
999
+ {
1000
+ "epoch": 3.18,
1001
+ "learning_rate": 1.4646920689470967e-05,
1002
+ "loss": 0.013,
1003
+ "step": 655
1004
+ },
1005
+ {
1006
+ "epoch": 3.2,
1007
+ "learning_rate": 1.4301096086781363e-05,
1008
+ "loss": 0.0035,
1009
+ "step": 660
1010
+ },
1011
+ {
1012
+ "epoch": 3.23,
1013
+ "learning_rate": 1.3957759891428568e-05,
1014
+ "loss": 0.0043,
1015
+ "step": 665
1016
+ },
1017
+ {
1018
+ "epoch": 3.25,
1019
+ "learning_rate": 1.3616991958350494e-05,
1020
+ "loss": 0.0072,
1021
+ "step": 670
1022
+ },
1023
+ {
1024
+ "epoch": 3.28,
1025
+ "learning_rate": 1.3278871545145084e-05,
1026
+ "loss": 0.0391,
1027
+ "step": 675
1028
+ },
1029
+ {
1030
+ "epoch": 3.28,
1031
+ "eval_loss": 0.04361347481608391,
1032
+ "eval_runtime": 0.1163,
1033
+ "eval_samples_per_second": 3535.033,
1034
+ "eval_steps_per_second": 111.814,
1035
+ "step": 675
1036
+ },
1037
+ {
1038
+ "epoch": 3.3,
1039
+ "learning_rate": 1.294347729363618e-05,
1040
+ "loss": 0.0266,
1041
+ "step": 680
1042
+ },
1043
+ {
1044
+ "epoch": 3.33,
1045
+ "learning_rate": 1.2610887211582612e-05,
1046
+ "loss": 0.0256,
1047
+ "step": 685
1048
+ },
1049
+ {
1050
+ "epoch": 3.35,
1051
+ "learning_rate": 1.2281178654534734e-05,
1052
+ "loss": 0.0311,
1053
+ "step": 690
1054
+ },
1055
+ {
1056
+ "epoch": 3.37,
1057
+ "learning_rate": 1.1954428307842646e-05,
1058
+ "loss": 0.0054,
1059
+ "step": 695
1060
+ },
1061
+ {
1062
+ "epoch": 3.4,
1063
+ "learning_rate": 1.1630712168820351e-05,
1064
+ "loss": 0.0282,
1065
+ "step": 700
1066
+ },
1067
+ {
1068
+ "epoch": 3.4,
1069
+ "eval_loss": 0.04334261640906334,
1070
+ "eval_runtime": 0.1164,
1071
+ "eval_samples_per_second": 3530.458,
1072
+ "eval_steps_per_second": 111.669,
1073
+ "step": 700
1074
+ },
1075
+ {
1076
+ "epoch": 3.42,
1077
+ "learning_rate": 1.1310105529069844e-05,
1078
+ "loss": 0.0409,
1079
+ "step": 705
1080
+ },
1081
+ {
1082
+ "epoch": 3.45,
1083
+ "learning_rate": 1.0992682956969458e-05,
1084
+ "loss": 0.0331,
1085
+ "step": 710
1086
+ },
1087
+ {
1088
+ "epoch": 3.47,
1089
+ "learning_rate": 1.0678518280330296e-05,
1090
+ "loss": 0.0116,
1091
+ "step": 715
1092
+ },
1093
+ {
1094
+ "epoch": 3.5,
1095
+ "learning_rate": 1.0367684569225028e-05,
1096
+ "loss": 0.0037,
1097
+ "step": 720
1098
+ },
1099
+ {
1100
+ "epoch": 3.52,
1101
+ "learning_rate": 1.0060254118992837e-05,
1102
+ "loss": 0.0465,
1103
+ "step": 725
1104
+ },
1105
+ {
1106
+ "epoch": 3.52,
1107
+ "eval_loss": 0.048130132257938385,
1108
+ "eval_runtime": 0.1194,
1109
+ "eval_samples_per_second": 3441.283,
1110
+ "eval_steps_per_second": 108.848,
1111
+ "step": 725
1112
+ },
1113
+ {
1114
+ "epoch": 3.54,
1115
+ "learning_rate": 9.756298433424587e-06,
1116
+ "loss": 0.1006,
1117
+ "step": 730
1118
+ },
1119
+ {
1120
+ "epoch": 3.57,
1121
+ "learning_rate": 9.455888208132102e-06,
1122
+ "loss": 0.0071,
1123
+ "step": 735
1124
+ },
1125
+ {
1126
+ "epoch": 3.59,
1127
+ "learning_rate": 9.159093314105405e-06,
1128
+ "loss": 0.0139,
1129
+ "step": 740
1130
+ },
1131
+ {
1132
+ "epoch": 3.62,
1133
+ "learning_rate": 8.865982781461791e-06,
1134
+ "loss": 0.004,
1135
+ "step": 745
1136
+ },
1137
+ {
1138
+ "epoch": 3.64,
1139
+ "learning_rate": 8.576624783390367e-06,
1140
+ "loss": 0.0557,
1141
+ "step": 750
1142
+ },
1143
+ {
1144
+ "epoch": 3.64,
1145
+ "eval_loss": 0.04551706090569496,
1146
+ "eval_runtime": 0.1186,
1147
+ "eval_samples_per_second": 3464.82,
1148
+ "eval_steps_per_second": 109.593,
1149
+ "step": 750
1150
+ },
1151
+ {
1152
+ "epoch": 3.67,
1153
+ "learning_rate": 8.291086620296052e-06,
1154
+ "loss": 0.0347,
1155
+ "step": 755
1156
+ },
1157
+ {
1158
+ "epoch": 3.69,
1159
+ "learning_rate": 8.009434704146424e-06,
1160
+ "loss": 0.0413,
1161
+ "step": 760
1162
+ },
1163
+ {
1164
+ "epoch": 3.71,
1165
+ "learning_rate": 7.731734543025345e-06,
1166
+ "loss": 0.0147,
1167
+ "step": 765
1168
+ },
1169
+ {
1170
+ "epoch": 3.74,
1171
+ "learning_rate": 7.458050725896673e-06,
1172
+ "loss": 0.0502,
1173
+ "step": 770
1174
+ },
1175
+ {
1176
+ "epoch": 3.76,
1177
+ "learning_rate": 7.188446907581894e-06,
1178
+ "loss": 0.058,
1179
+ "step": 775
1180
+ },
1181
+ {
1182
+ "epoch": 3.76,
1183
+ "eval_loss": 0.04955735430121422,
1184
+ "eval_runtime": 0.1466,
1185
+ "eval_samples_per_second": 2802.9,
1186
+ "eval_steps_per_second": 88.656,
1187
+ "step": 775
1188
+ },
1189
+ {
1190
+ "epoch": 3.79,
1191
+ "learning_rate": 6.922985793954881e-06,
1192
+ "loss": 0.0127,
1193
+ "step": 780
1194
+ },
1195
+ {
1196
+ "epoch": 3.81,
1197
+ "learning_rate": 6.661729127357494e-06,
1198
+ "loss": 0.0335,
1199
+ "step": 785
1200
+ },
1201
+ {
1202
+ "epoch": 3.83,
1203
+ "learning_rate": 6.404737672239173e-06,
1204
+ "loss": 0.0271,
1205
+ "step": 790
1206
+ },
1207
+ {
1208
+ "epoch": 3.86,
1209
+ "learning_rate": 6.1520712010240455e-06,
1210
+ "loss": 0.0512,
1211
+ "step": 795
1212
+ },
1213
+ {
1214
+ "epoch": 3.88,
1215
+ "learning_rate": 5.9037884802087325e-06,
1216
+ "loss": 0.0057,
1217
+ "step": 800
1218
+ },
1219
+ {
1220
+ "epoch": 3.88,
1221
+ "eval_loss": 0.051423329859972,
1222
+ "eval_runtime": 0.1163,
1223
+ "eval_samples_per_second": 3533.425,
1224
+ "eval_steps_per_second": 111.763,
1225
+ "step": 800
1226
+ },
1227
+ {
1228
+ "epoch": 3.91,
1229
+ "learning_rate": 5.659947256694156e-06,
1230
+ "loss": 0.0029,
1231
+ "step": 805
1232
+ },
1233
+ {
1234
+ "epoch": 3.93,
1235
+ "learning_rate": 5.420604244354408e-06,
1236
+ "loss": 0.0095,
1237
+ "step": 810
1238
+ },
1239
+ {
1240
+ "epoch": 3.96,
1241
+ "learning_rate": 5.185815110845996e-06,
1242
+ "loss": 0.0491,
1243
+ "step": 815
1244
+ },
1245
+ {
1246
+ "epoch": 3.98,
1247
+ "learning_rate": 4.9556344646603165e-06,
1248
+ "loss": 0.0251,
1249
+ "step": 820
1250
+ },
1251
+ {
1252
+ "epoch": 4.0,
1253
+ "learning_rate": 4.73011584242257e-06,
1254
+ "loss": 0.032,
1255
+ "step": 825
1256
+ },
1257
+ {
1258
+ "epoch": 4.0,
1259
+ "eval_loss": 0.05062270909547806,
1260
+ "eval_runtime": 0.1194,
1261
+ "eval_samples_per_second": 3442.19,
1262
+ "eval_steps_per_second": 108.877,
1263
+ "step": 825
1264
+ },
1265
+ {
1266
+ "epoch": 4.03,
1267
+ "learning_rate": 4.509311696439903e-06,
1268
+ "loss": 0.0354,
1269
+ "step": 830
1270
+ },
1271
+ {
1272
+ "epoch": 4.05,
1273
+ "learning_rate": 4.293273382501775e-06,
1274
+ "loss": 0.0737,
1275
+ "step": 835
1276
+ },
1277
+ {
1278
+ "epoch": 4.08,
1279
+ "learning_rate": 4.082051147935373e-06,
1280
+ "loss": 0.0198,
1281
+ "step": 840
1282
+ },
1283
+ {
1284
+ "epoch": 4.1,
1285
+ "learning_rate": 3.875694119918805e-06,
1286
+ "loss": 0.0266,
1287
+ "step": 845
1288
+ },
1289
+ {
1290
+ "epoch": 4.13,
1291
+ "learning_rate": 3.6742502940548734e-06,
1292
+ "loss": 0.0056,
1293
+ "step": 850
1294
+ },
1295
+ {
1296
+ "epoch": 4.13,
1297
+ "eval_loss": 0.048265255987644196,
1298
+ "eval_runtime": 0.1135,
1299
+ "eval_samples_per_second": 3619.583,
1300
+ "eval_steps_per_second": 114.488,
1301
+ "step": 850
1302
+ },
1303
+ {
1304
+ "epoch": 4.15,
1305
+ "learning_rate": 3.477766523207965e-06,
1306
+ "loss": 0.0017,
1307
+ "step": 855
1308
+ },
1309
+ {
1310
+ "epoch": 4.17,
1311
+ "learning_rate": 3.286288506606805e-06,
1312
+ "loss": 0.0009,
1313
+ "step": 860
1314
+ },
1315
+ {
1316
+ "epoch": 4.2,
1317
+ "learning_rate": 3.0998607792154495e-06,
1318
+ "loss": 0.0153,
1319
+ "step": 865
1320
+ },
1321
+ {
1322
+ "epoch": 4.22,
1323
+ "learning_rate": 2.91852670137516e-06,
1324
+ "loss": 0.0232,
1325
+ "step": 870
1326
+ },
1327
+ {
1328
+ "epoch": 4.25,
1329
+ "learning_rate": 2.7423284487193697e-06,
1330
+ "loss": 0.0104,
1331
+ "step": 875
1332
+ },
1333
+ {
1334
+ "epoch": 4.25,
1335
+ "eval_loss": 0.048452552407979965,
1336
+ "eval_runtime": 0.1158,
1337
+ "eval_samples_per_second": 3550.622,
1338
+ "eval_steps_per_second": 112.307,
1339
+ "step": 875
1340
+ },
1341
+ {
1342
+ "epoch": 4.27,
1343
+ "learning_rate": 2.571307002364301e-06,
1344
+ "loss": 0.0146,
1345
+ "step": 880
1346
+ },
1347
+ {
1348
+ "epoch": 4.3,
1349
+ "learning_rate": 2.4055021393773277e-06,
1350
+ "loss": 0.062,
1351
+ "step": 885
1352
+ },
1353
+ {
1354
+ "epoch": 4.32,
1355
+ "learning_rate": 2.2449524235254484e-06,
1356
+ "loss": 0.0415,
1357
+ "step": 890
1358
+ },
1359
+ {
1360
+ "epoch": 4.34,
1361
+ "learning_rate": 2.089695196305888e-06,
1362
+ "loss": 0.0037,
1363
+ "step": 895
1364
+ },
1365
+ {
1366
+ "epoch": 4.37,
1367
+ "learning_rate": 1.939766568261037e-06,
1368
+ "loss": 0.0353,
1369
+ "step": 900
1370
+ },
1371
+ {
1372
+ "epoch": 4.37,
1373
+ "eval_loss": 0.04834846034646034,
1374
+ "eval_runtime": 0.1173,
1375
+ "eval_samples_per_second": 3503.757,
1376
+ "eval_steps_per_second": 110.824,
1377
+ "step": 900
1378
+ },
1379
+ {
1380
+ "epoch": 4.39,
1381
+ "learning_rate": 1.7952014105796666e-06,
1382
+ "loss": 0.0316,
1383
+ "step": 905
1384
+ },
1385
+ {
1386
+ "epoch": 4.42,
1387
+ "learning_rate": 1.656033346986416e-06,
1388
+ "loss": 0.0238,
1389
+ "step": 910
1390
+ },
1391
+ {
1392
+ "epoch": 4.44,
1393
+ "learning_rate": 1.5222947459213793e-06,
1394
+ "loss": 0.025,
1395
+ "step": 915
1396
+ },
1397
+ {
1398
+ "epoch": 4.47,
1399
+ "learning_rate": 1.3940167130117226e-06,
1400
+ "loss": 0.0076,
1401
+ "step": 920
1402
+ },
1403
+ {
1404
+ "epoch": 4.49,
1405
+ "learning_rate": 1.2712290838369363e-06,
1406
+ "loss": 0.0044,
1407
+ "step": 925
1408
+ },
1409
+ {
1410
+ "epoch": 4.49,
1411
+ "eval_loss": 0.0482080839574337,
1412
+ "eval_runtime": 0.1126,
1413
+ "eval_samples_per_second": 3651.3,
1414
+ "eval_steps_per_second": 115.491,
1415
+ "step": 925
1416
+ },
1417
+ {
1418
+ "epoch": 4.51,
1419
+ "learning_rate": 1.1539604169895497e-06,
1420
+ "loss": 0.0038,
1421
+ "step": 930
1422
+ },
1423
+ {
1424
+ "epoch": 4.54,
1425
+ "learning_rate": 1.0422379874328008e-06,
1426
+ "loss": 0.0038,
1427
+ "step": 935
1428
+ },
1429
+ {
1430
+ "epoch": 4.56,
1431
+ "learning_rate": 9.360877801568968e-07,
1432
+ "loss": 0.0463,
1433
+ "step": 940
1434
+ },
1435
+ {
1436
+ "epoch": 4.59,
1437
+ "learning_rate": 8.355344841352836e-07,
1438
+ "loss": 0.001,
1439
+ "step": 945
1440
+ },
1441
+ {
1442
+ "epoch": 4.61,
1443
+ "learning_rate": 7.406014865823513e-07,
1444
+ "loss": 0.0449,
1445
+ "step": 950
1446
+ },
1447
+ {
1448
+ "epoch": 4.61,
1449
+ "eval_loss": 0.04827665537595749,
1450
+ "eval_runtime": 0.1168,
1451
+ "eval_samples_per_second": 3519.71,
1452
+ "eval_steps_per_second": 111.329,
1453
+ "step": 950
1454
+ },
1455
+ {
1456
+ "epoch": 4.64,
1457
+ "learning_rate": 6.513108675139101e-07,
1458
+ "loss": 0.0138,
1459
+ "step": 955
1460
+ },
1461
+ {
1462
+ "epoch": 4.66,
1463
+ "learning_rate": 5.676833946117205e-07,
1464
+ "loss": 0.0407,
1465
+ "step": 960
1466
+ },
1467
+ {
1468
+ "epoch": 4.68,
1469
+ "learning_rate": 4.897385183932179e-07,
1470
+ "loss": 0.0392,
1471
+ "step": 965
1472
+ },
1473
+ {
1474
+ "epoch": 4.71,
1475
+ "learning_rate": 4.1749436768762084e-07,
1476
+ "loss": 0.0358,
1477
+ "step": 970
1478
+ },
1479
+ {
1480
+ "epoch": 4.73,
1481
+ "learning_rate": 3.509677454194282e-07,
1482
+ "loss": 0.0626,
1483
+ "step": 975
1484
+ },
1485
+ {
1486
+ "epoch": 4.73,
1487
+ "eval_loss": 0.04846416041254997,
1488
+ "eval_runtime": 0.116,
1489
+ "eval_samples_per_second": 3543.615,
1490
+ "eval_steps_per_second": 112.085,
1491
+ "step": 975
1492
+ },
1493
+ {
1494
+ "epoch": 4.76,
1495
+ "learning_rate": 2.901741247002987e-07,
1496
+ "loss": 0.0069,
1497
+ "step": 980
1498
+ },
1499
+ {
1500
+ "epoch": 4.78,
1501
+ "learning_rate": 2.3512764523025698e-07,
1502
+ "loss": 0.0697,
1503
+ "step": 985
1504
+ },
1505
+ {
1506
+ "epoch": 4.81,
1507
+ "learning_rate": 1.8584111000897464e-07,
1508
+ "loss": 0.03,
1509
+ "step": 990
1510
+ },
1511
+ {
1512
+ "epoch": 4.83,
1513
+ "learning_rate": 1.423259823580131e-07,
1514
+ "loss": 0.0007,
1515
+ "step": 995
1516
+ },
1517
+ {
1518
+ "epoch": 4.85,
1519
+ "learning_rate": 1.0459238325460363e-07,
1520
+ "loss": 0.0631,
1521
+ "step": 1000
1522
+ },
1523
+ {
1524
+ "epoch": 4.85,
1525
+ "eval_loss": 0.0483647957444191,
1526
+ "eval_runtime": 0.146,
1527
+ "eval_samples_per_second": 2815.962,
1528
+ "eval_steps_per_second": 89.069,
1529
+ "step": 1000
1530
+ },
1531
+ {
1532
+ "epoch": 4.88,
1533
+ "learning_rate": 7.264908897766098e-08,
1534
+ "loss": 0.0569,
1535
+ "step": 1005
1536
+ },
1537
+ {
1538
+ "epoch": 4.9,
1539
+ "learning_rate": 4.650352906655775e-08,
1540
+ "loss": 0.0304,
1541
+ "step": 1010
1542
+ },
1543
+ {
1544
+ "epoch": 4.93,
1545
+ "learning_rate": 2.616178459311225e-08,
1546
+ "loss": 0.0101,
1547
+ "step": 1015
1548
+ },
1549
+ {
1550
+ "epoch": 4.95,
1551
+ "learning_rate": 1.1628586747233772e-08,
1552
+ "loss": 0.0035,
1553
+ "step": 1020
1554
+ },
1555
+ {
1556
+ "epoch": 4.98,
1557
+ "learning_rate": 2.9073157365056002e-09,
1558
+ "loss": 0.0054,
1559
+ "step": 1025
1560
+ },
1561
+ {
1562
+ "epoch": 4.98,
1563
+ "eval_loss": 0.04826511815190315,
1564
+ "eval_runtime": 0.1121,
1565
+ "eval_samples_per_second": 3664.947,
1566
+ "eval_steps_per_second": 115.923,
1567
+ "step": 1025
1568
+ },
1569
+ {
1570
+ "epoch": 5.0,
1571
+ "learning_rate": 0.0,
1572
+ "loss": 0.003,
1573
+ "step": 1030
1574
  }
1575
  ],
1576
  "logging_steps": 5,
1577
  "max_steps": 1030,
1578
  "num_train_epochs": 5,
1579
  "save_steps": 500,
1580
+ "total_flos": 459260224208896.0,
1581
  "trial_name": null,
1582
  "trial_params": null
1583
  }