jflotz committed
Commit 3e23b11
1 Parent(s): 4144d3a

Training in progress, step 100000

last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e24722d3d71d5dada97b20d5f2001082a254811eb675e34eb93438f0ea062fd3
+oid sha256:2756bd704606cd6c5c35ea5f45a21e975a94f7ac54bd0802ebe8750dfbd1eba8
 size 50044689
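The checkpoint files in this commit are tracked with Git LFS, so each diff only touches the three-line pointer (`version`, `oid sha256:...`, `size`); the binary payloads themselves live in LFS storage. As a minimal sketch (not part of this repository), a locally resolved LFS object can be checked against its pointer by recomputing the SHA-256 digest; the file and pointer paths below are hypothetical:

```python
import hashlib
from pathlib import Path

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    """Stream a file and return its hex SHA-256 digest."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Hypothetical local copies of the pointer text and the resolved LFS object.
pointer_text = Path("optimizer.pt.pointer").read_text()
expected_oid = next(
    line.split("sha256:")[1].strip()
    for line in pointer_text.splitlines()
    if line.startswith("oid sha256:")
)
assert sha256_of("last-checkpoint/optimizer.pt") == expected_oid
```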
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f8c6c3bb299aca29d8267ff6984670047e5650cb7a903f6095214235ebc45d18
+oid sha256:0fc703713a0fad50a78f7cc73423f660e122486f1451ea9412d49c8df9646af6
 size 25761253
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:057898351285142abb8c9f32929ef0d1d3ad74103ddc4be712f92726450c3465
+oid sha256:f94f403b594a29ecb6816cee93c65f5e3a0566d5747151b3697716adac4e9951
 size 14503
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:057898351285142abb8c9f32929ef0d1d3ad74103ddc4be712f92726450c3465
+oid sha256:f94f403b594a29ecb6816cee93c65f5e3a0566d5747151b3697716adac4e9951
 size 14503
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:057898351285142abb8c9f32929ef0d1d3ad74103ddc4be712f92726450c3465
+oid sha256:f94f403b594a29ecb6816cee93c65f5e3a0566d5747151b3697716adac4e9951
 size 14503
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:057898351285142abb8c9f32929ef0d1d3ad74103ddc4be712f92726450c3465
+oid sha256:f94f403b594a29ecb6816cee93c65f5e3a0566d5747151b3697716adac4e9951
 size 14503
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:057898351285142abb8c9f32929ef0d1d3ad74103ddc4be712f92726450c3465
+oid sha256:f94f403b594a29ecb6816cee93c65f5e3a0566d5747151b3697716adac4e9951
 size 14503
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:057898351285142abb8c9f32929ef0d1d3ad74103ddc4be712f92726450c3465
+oid sha256:f94f403b594a29ecb6816cee93c65f5e3a0566d5747151b3697716adac4e9951
 size 14503
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:057898351285142abb8c9f32929ef0d1d3ad74103ddc4be712f92726450c3465
+oid sha256:f94f403b594a29ecb6816cee93c65f5e3a0566d5747151b3697716adac4e9951
 size 14503
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:057898351285142abb8c9f32929ef0d1d3ad74103ddc4be712f92726450c3465
+oid sha256:f94f403b594a29ecb6816cee93c65f5e3a0566d5747151b3697716adac4e9951
 size 14503
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:36f11c380127c9b2248f3452b83eb7e7a6efc224b0b84b63651e3db0e819c91c
+oid sha256:0d85ea74361bfabc4dca40ed2a4dec24f25124d91f625a1176acad7044d70175
 size 623
last-checkpoint/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 4.014989293361884,
-  "global_step": 90000,
+  "epoch": 4.461099214846538,
+  "global_step": 100000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -1806,11 +1806,211 @@
       "eval_samples_per_second": 1053.586,
       "eval_steps_per_second": 16.512,
       "step": 90000
+    },
+    {
+      "epoch": 4.04,
+      "learning_rate": 0.00045642490670472436,
+      "loss": 0.4084,
+      "step": 90500
+    },
+    {
+      "epoch": 4.06,
+      "learning_rate": 0.0004547471653280225,
+      "loss": 0.4078,
+      "step": 91000
+    },
+    {
+      "epoch": 4.06,
+      "eval_loss": 0.37994059920310974,
+      "eval_runtime": 2.2046,
+      "eval_samples_per_second": 1041.911,
+      "eval_steps_per_second": 16.329,
+      "step": 91000
+    },
+    {
+      "epoch": 4.08,
+      "learning_rate": 0.00045306287352519543,
+      "loss": 0.407,
+      "step": 91500
+    },
+    {
+      "epoch": 4.1,
+      "learning_rate": 0.00045137210497262333,
+      "loss": 0.4068,
+      "step": 92000
+    },
+    {
+      "epoch": 4.1,
+      "eval_loss": 0.3793868124485016,
+      "eval_runtime": 2.2708,
+      "eval_samples_per_second": 1011.534,
+      "eval_steps_per_second": 15.853,
+      "step": 92000
+    },
+    {
+      "epoch": 4.13,
+      "learning_rate": 0.0004496749336299999,
+      "loss": 0.4063,
+      "step": 92500
+    },
+    {
+      "epoch": 4.15,
+      "learning_rate": 0.0004479714337370977,
+      "loss": 0.4057,
+      "step": 93000
+    },
+    {
+      "epoch": 4.15,
+      "eval_loss": 0.37835967540740967,
+      "eval_runtime": 2.237,
+      "eval_samples_per_second": 1026.834,
+      "eval_steps_per_second": 16.093,
+      "step": 93000
+    },
+    {
+      "epoch": 4.17,
+      "learning_rate": 0.00044626167981052036,
+      "loss": 0.4052,
+      "step": 93500
+    },
+    {
+      "epoch": 4.19,
+      "learning_rate": 0.00044454574664044404,
+      "loss": 0.4047,
+      "step": 94000
+    },
+    {
+      "epoch": 4.19,
+      "eval_loss": 0.37884432077407837,
+      "eval_runtime": 2.2678,
+      "eval_samples_per_second": 1012.88,
+      "eval_steps_per_second": 15.874,
+      "step": 94000
+    },
+    {
+      "epoch": 4.22,
+      "learning_rate": 0.000442823709287344,
+      "loss": 0.4044,
+      "step": 94500
+    },
+    {
+      "epoch": 4.24,
+      "learning_rate": 0.0004410956430787129,
+      "loss": 0.4047,
+      "step": 95000
+    },
+    {
+      "epoch": 4.24,
+      "eval_loss": 0.37695789337158203,
+      "eval_runtime": 2.1458,
+      "eval_samples_per_second": 1070.466,
+      "eval_steps_per_second": 16.777,
+      "step": 95000
+    },
+    {
+      "epoch": 4.26,
+      "learning_rate": 0.0004393616236057647,
+      "loss": 0.4036,
+      "step": 95500
+    },
+    {
+      "epoch": 4.28,
+      "learning_rate": 0.00043762172672012875,
+      "loss": 0.4029,
+      "step": 96000
+    },
+    {
+      "epoch": 4.28,
+      "eval_loss": 0.37500157952308655,
+      "eval_runtime": 2.2301,
+      "eval_samples_per_second": 1029.999,
+      "eval_steps_per_second": 16.143,
+      "step": 96000
+    },
+    {
+      "epoch": 4.3,
+      "learning_rate": 0.0004358760285305312,
+      "loss": 0.4025,
+      "step": 96500
+    },
+    {
+      "epoch": 4.33,
+      "learning_rate": 0.0004341246053994663,
+      "loss": 0.4022,
+      "step": 97000
+    },
+    {
+      "epoch": 4.33,
+      "eval_loss": 0.37471264600753784,
+      "eval_runtime": 2.2149,
+      "eval_samples_per_second": 1037.069,
+      "eval_steps_per_second": 16.254,
+      "step": 97000
+    },
+    {
+      "epoch": 4.35,
+      "learning_rate": 0.00043236753393985534,
+      "loss": 0.4019,
+      "step": 97500
+    },
+    {
+      "epoch": 4.37,
+      "learning_rate": 0.0004306048910116964,
+      "loss": 0.4015,
+      "step": 98000
+    },
+    {
+      "epoch": 4.37,
+      "eval_loss": 0.3735538125038147,
+      "eval_runtime": 2.2361,
+      "eval_samples_per_second": 1027.233,
+      "eval_steps_per_second": 16.099,
+      "step": 98000
+    },
+    {
+      "epoch": 4.39,
+      "learning_rate": 0.0004288367537187012,
+      "loss": 0.4011,
+      "step": 98500
+    },
+    {
+      "epoch": 4.42,
+      "learning_rate": 0.00042706319940492284,
+      "loss": 0.4007,
+      "step": 99000
+    },
+    {
+      "epoch": 4.42,
+      "eval_loss": 0.3751888573169708,
+      "eval_runtime": 2.2595,
+      "eval_samples_per_second": 1016.613,
+      "eval_steps_per_second": 15.933,
+      "step": 99000
+    },
+    {
+      "epoch": 4.44,
+      "learning_rate": 0.00042528430565137254,
+      "loss": 0.4003,
+      "step": 99500
+    },
+    {
+      "epoch": 4.46,
+      "learning_rate": 0.00042350015027262593,
+      "loss": 0.4,
+      "step": 100000
+    },
+    {
+      "epoch": 4.46,
+      "eval_loss": 0.37434616684913635,
+      "eval_runtime": 2.2061,
+      "eval_samples_per_second": 1041.22,
+      "eval_steps_per_second": 16.319,
+      "step": 100000
     }
   ],
   "max_steps": 250000,
   "num_train_epochs": 12,
-  "total_flos": 1.4414834978635425e+21,
+  "total_flos": 1.6016515007293466e+21,
   "trial_name": null,
   "trial_params": null
 }
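The trainer_state.json diff above appends thirty new log entries covering steps 90,500 through 100,000: training loss and learning rate every 500 steps, eval metrics every 1,000 steps. As a small sketch for inspecting that progression, assuming the standard Hugging Face transformers layout where these entries sit under a top-level "log_history" list (the local path is hypothetical):

```python
import json

# Hypothetical local path to the checkpoint's trainer state.
with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss"/"learning_rate"; eval entries carry "eval_loss".
train_points = [
    (entry["step"], entry["loss"], entry["learning_rate"])
    for entry in state["log_history"]
    if "loss" in entry
]
eval_points = [
    (entry["step"], entry["eval_loss"])
    for entry in state["log_history"]
    if "eval_loss" in entry
]

print(f"global_step={state['global_step']}, epoch={state['epoch']:.2f}")
print("last train point (step, loss, lr):", train_points[-1])
print("last eval point (step, eval_loss):", eval_points[-1])
```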
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f8c6c3bb299aca29d8267ff6984670047e5650cb7a903f6095214235ebc45d18
+oid sha256:0fc703713a0fad50a78f7cc73423f660e122486f1451ea9412d49c8df9646af6
 size 25761253