jflotz commited on
Commit
fc5afd9
1 Parent(s): 55d0152

Training in progress, step 1000000

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb4cd9e789adbd1802119018bcfc4f0b6dba2541ced8918776537c19936d2aa3
3
  size 893439185
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3934f519240d590552d43746648c081056a7995bf6c44310ab67246f6ef8ad67
3
  size 893439185
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21e636c80ed6aaf4e2b5d21598685c1a08b0a8d8edf7041e56552898357162ca
3
  size 449471589
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:619263ccd39f733619bbbf55e178f9282f2d9680aa9481a120d8cd9e41fe0f1b
3
  size 449471589
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c1caabd63b797b525c8b3557d0ed6bdcb32c060e6354cffd8a2f88412a58c50
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f12e8f0b7966c04954bff8e89ed067117d335fd21dca824245f60b5603214287
3
  size 14503
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c1caabd63b797b525c8b3557d0ed6bdcb32c060e6354cffd8a2f88412a58c50
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f12e8f0b7966c04954bff8e89ed067117d335fd21dca824245f60b5603214287
3
  size 14503
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c1caabd63b797b525c8b3557d0ed6bdcb32c060e6354cffd8a2f88412a58c50
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f12e8f0b7966c04954bff8e89ed067117d335fd21dca824245f60b5603214287
3
  size 14503
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c1caabd63b797b525c8b3557d0ed6bdcb32c060e6354cffd8a2f88412a58c50
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f12e8f0b7966c04954bff8e89ed067117d335fd21dca824245f60b5603214287
3
  size 14503
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c1caabd63b797b525c8b3557d0ed6bdcb32c060e6354cffd8a2f88412a58c50
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f12e8f0b7966c04954bff8e89ed067117d335fd21dca824245f60b5603214287
3
  size 14503
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c1caabd63b797b525c8b3557d0ed6bdcb32c060e6354cffd8a2f88412a58c50
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f12e8f0b7966c04954bff8e89ed067117d335fd21dca824245f60b5603214287
3
  size 14503
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c1caabd63b797b525c8b3557d0ed6bdcb32c060e6354cffd8a2f88412a58c50
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f12e8f0b7966c04954bff8e89ed067117d335fd21dca824245f60b5603214287
3
  size 14503
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4c1caabd63b797b525c8b3557d0ed6bdcb32c060e6354cffd8a2f88412a58c50
3
  size 14503
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f12e8f0b7966c04954bff8e89ed067117d335fd21dca824245f60b5603214287
3
  size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2358905887cd0ce80c53b6e8a0174e039c4c5bd62c6c91c86f0312f9b46fcf7
3
  size 623
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d806e9f9f09813043b95cbeda18b18cdfb60c100fbde3239bf79ee81c659dc36
3
  size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 11.041343698069438,
5
- "global_step": 990000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
@@ -19806,11 +19806,211 @@
19806
  "eval_samples_per_second": 878.327,
19807
  "eval_steps_per_second": 13.766,
19808
  "step": 990000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19809
  }
19810
  ],
19811
  "max_steps": 1000000,
19812
  "num_train_epochs": 12,
19813
- "total_flos": 6.9398656010816955e+22,
19814
  "trial_name": null,
19815
  "trial_params": null
19816
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 11.152872422292361,
5
+ "global_step": 1000000,
6
  "is_hyper_param_search": false,
7
  "is_local_process_zero": true,
8
  "is_world_process_zero": true,
 
19806
  "eval_samples_per_second": 878.327,
19807
  "eval_steps_per_second": 13.766,
19808
  "step": 990000
19809
+ },
19810
+ {
19811
+ "epoch": 11.05,
19812
+ "learning_rate": 1.003454077439879e-05,
19813
+ "loss": 0.1795,
19814
+ "step": 990500
19815
+ },
19816
+ {
19817
+ "epoch": 11.05,
19818
+ "learning_rate": 1.0031000845556304e-05,
19819
+ "loss": 0.1792,
19820
+ "step": 991000
19821
+ },
19822
+ {
19823
+ "epoch": 11.05,
19824
+ "eval_loss": 0.17132483422756195,
19825
+ "eval_runtime": 2.6196,
19826
+ "eval_samples_per_second": 876.851,
19827
+ "eval_steps_per_second": 13.743,
19828
+ "step": 991000
19829
+ },
19830
+ {
19831
+ "epoch": 11.06,
19832
+ "learning_rate": 1.0027652209285743e-05,
19833
+ "loss": 0.1795,
19834
+ "step": 991500
19835
+ },
19836
+ {
19837
+ "epoch": 11.06,
19838
+ "learning_rate": 1.0024494874742152e-05,
19839
+ "loss": 0.1794,
19840
+ "step": 992000
19841
+ },
19842
+ {
19843
+ "epoch": 11.06,
19844
+ "eval_loss": 0.1712769716978073,
19845
+ "eval_runtime": 2.602,
19846
+ "eval_samples_per_second": 882.772,
19847
+ "eval_steps_per_second": 13.835,
19848
+ "step": 992000
19849
+ },
19850
+ {
19851
+ "epoch": 11.07,
19852
+ "learning_rate": 1.0021528850557572e-05,
19853
+ "loss": 0.1793,
19854
+ "step": 992500
19855
+ },
19856
+ {
19857
+ "epoch": 11.07,
19858
+ "learning_rate": 1.0018754144840986e-05,
19859
+ "loss": 0.1794,
19860
+ "step": 993000
19861
+ },
19862
+ {
19863
+ "epoch": 11.07,
19864
+ "eval_loss": 0.17019130289554596,
19865
+ "eval_runtime": 2.6352,
19866
+ "eval_samples_per_second": 871.66,
19867
+ "eval_steps_per_second": 13.661,
19868
+ "step": 993000
19869
+ },
19870
+ {
19871
+ "epoch": 11.08,
19872
+ "learning_rate": 1.0016170765178345e-05,
19873
+ "loss": 0.1796,
19874
+ "step": 993500
19875
+ },
19876
+ {
19877
+ "epoch": 11.09,
19878
+ "learning_rate": 1.0013778718632507e-05,
19879
+ "loss": 0.1795,
19880
+ "step": 994000
19881
+ },
19882
+ {
19883
+ "epoch": 11.09,
19884
+ "eval_loss": 0.16902120411396027,
19885
+ "eval_runtime": 2.6744,
19886
+ "eval_samples_per_second": 858.899,
19887
+ "eval_steps_per_second": 13.461,
19888
+ "step": 994000
19889
+ },
19890
+ {
19891
+ "epoch": 11.09,
19892
+ "learning_rate": 1.0011578011743233e-05,
19893
+ "loss": 0.1794,
19894
+ "step": 994500
19895
+ },
19896
+ {
19897
+ "epoch": 11.1,
19898
+ "learning_rate": 1.000956865052717e-05,
19899
+ "loss": 0.1795,
19900
+ "step": 995000
19901
+ },
19902
+ {
19903
+ "epoch": 11.1,
19904
+ "eval_loss": 0.17112106084823608,
19905
+ "eval_runtime": 2.6298,
19906
+ "eval_samples_per_second": 873.44,
19907
+ "eval_steps_per_second": 13.689,
19908
+ "step": 995000
19909
+ },
19910
+ {
19911
+ "epoch": 11.1,
19912
+ "learning_rate": 1.0007750640477843e-05,
19913
+ "loss": 0.1797,
19914
+ "step": 995500
19915
+ },
19916
+ {
19917
+ "epoch": 11.11,
19918
+ "learning_rate": 1.0006123986565623e-05,
19919
+ "loss": 0.1797,
19920
+ "step": 996000
19921
+ },
19922
+ {
19923
+ "epoch": 11.11,
19924
+ "eval_loss": 0.17197231948375702,
19925
+ "eval_runtime": 2.6674,
19926
+ "eval_samples_per_second": 861.138,
19927
+ "eval_steps_per_second": 13.496,
19928
+ "step": 996000
19929
+ },
19930
+ {
19931
+ "epoch": 11.11,
19932
+ "learning_rate": 1.0004688693237708e-05,
19933
+ "loss": 0.179,
19934
+ "step": 996500
19935
+ },
19936
+ {
19937
+ "epoch": 11.12,
19938
+ "learning_rate": 1.0003444764418138e-05,
19939
+ "loss": 0.1795,
19940
+ "step": 997000
19941
+ },
19942
+ {
19943
+ "epoch": 11.12,
19944
+ "eval_loss": 0.16935667395591736,
19945
+ "eval_runtime": 2.6744,
19946
+ "eval_samples_per_second": 858.882,
19947
+ "eval_steps_per_second": 13.461,
19948
+ "step": 997000
19949
+ },
19950
+ {
19951
+ "epoch": 11.12,
19952
+ "learning_rate": 1.0002392203507781e-05,
19953
+ "loss": 0.1796,
19954
+ "step": 997500
19955
+ },
19956
+ {
19957
+ "epoch": 11.13,
19958
+ "learning_rate": 1.000153101338428e-05,
19959
+ "loss": 0.1794,
19960
+ "step": 998000
19961
+ },
19962
+ {
19963
+ "epoch": 11.13,
19964
+ "eval_loss": 0.16944564878940582,
19965
+ "eval_runtime": 2.6058,
19966
+ "eval_samples_per_second": 881.508,
19967
+ "eval_steps_per_second": 13.816,
19968
+ "step": 998000
19969
+ },
19970
+ {
19971
+ "epoch": 11.14,
19972
+ "learning_rate": 1.00008611964021e-05,
19973
+ "loss": 0.1795,
19974
+ "step": 998500
19975
+ },
19976
+ {
19977
+ "epoch": 11.14,
19978
+ "learning_rate": 1.00003827543925e-05,
19979
+ "loss": 0.1797,
19980
+ "step": 999000
19981
+ },
19982
+ {
19983
+ "epoch": 11.14,
19984
+ "eval_loss": 0.1695910096168518,
19985
+ "eval_runtime": 2.6979,
19986
+ "eval_samples_per_second": 851.388,
19987
+ "eval_steps_per_second": 13.343,
19988
+ "step": 999000
19989
+ },
19990
+ {
19991
+ "epoch": 11.15,
19992
+ "learning_rate": 1.0000095688663532e-05,
19993
+ "loss": 0.1796,
19994
+ "step": 999500
19995
+ },
19996
+ {
19997
+ "epoch": 11.15,
19998
+ "learning_rate": 1e-05,
19999
+ "loss": 0.1796,
20000
+ "step": 1000000
20001
+ },
20002
+ {
20003
+ "epoch": 11.15,
20004
+ "eval_loss": 0.16828955709934235,
20005
+ "eval_runtime": 2.6549,
20006
+ "eval_samples_per_second": 865.189,
20007
+ "eval_steps_per_second": 13.56,
20008
+ "step": 1000000
20009
  }
20010
  ],
20011
  "max_steps": 1000000,
20012
  "num_train_epochs": 12,
20013
+ "total_flos": 7.009965862112043e+22,
20014
  "trial_name": null,
20015
  "trial_params": null
20016
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21e636c80ed6aaf4e2b5d21598685c1a08b0a8d8edf7041e56552898357162ca
3
  size 449471589
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:619263ccd39f733619bbbf55e178f9282f2d9680aa9481a120d8cd9e41fe0f1b
3
  size 449471589