YufeiWeng committed on
Commit
e787da3
1 Parent(s): bd5042f

End of training

Browse files
README.md CHANGED
@@ -1,6 +1,8 @@
1
  ---
2
  base_model: microsoft/dit-base-finetuned-rvlcdip
3
  tags:
 
 
4
  - generated_from_trainer
5
  metrics:
6
  - f1
@@ -16,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [microsoft/dit-base-finetuned-rvlcdip](https://huggingface.co/microsoft/dit-base-finetuned-rvlcdip) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.0539
20
- - F1: 0.6108
21
 
22
  ## Model description
23
 
 
1
  ---
2
  base_model: microsoft/dit-base-finetuned-rvlcdip
3
  tags:
4
+ - image-classification
5
+ - vision
6
  - generated_from_trainer
7
  metrics:
8
  - f1
 
18
 
19
  This model is a fine-tuned version of [microsoft/dit-base-finetuned-rvlcdip](https://huggingface.co/microsoft/dit-base-finetuned-rvlcdip) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.0404
22
+ - F1: 0.6134
23
 
24
  ## Model description
25
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 0.5427922241858116,
3
- "eval_f1": 0.6095153739086423,
4
- "eval_loss": 0.05489746853709221,
5
- "eval_runtime": 1141.5758,
6
- "eval_samples_per_second": 180.682,
7
- "eval_steps_per_second": 2.823,
8
- "total_flos": 1.0663947529637069e+19,
9
- "train_loss": 0.08201153971428095,
10
- "train_runtime": 47443.3166,
11
- "train_samples_per_second": 13.49,
12
- "train_steps_per_second": 0.211
13
  }
 
1
  {
2
+ "epoch": 0.6942691239585963,
3
+ "eval_f1": 0.6133951445650848,
4
+ "eval_loss": 0.04044894501566887,
5
+ "eval_runtime": 1162.523,
6
+ "eval_samples_per_second": 177.426,
7
+ "eval_steps_per_second": 2.772,
8
+ "total_flos": 1.3639932886745088e+19,
9
+ "train_loss": 0.019194319985129618,
10
+ "train_runtime": 18605.0451,
11
+ "train_samples_per_second": 34.399,
12
+ "train_steps_per_second": 0.537
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.5427922241858116,
3
- "eval_f1": 0.6095153739086423,
4
- "eval_loss": 0.05489746853709221,
5
- "eval_runtime": 1141.5758,
6
- "eval_samples_per_second": 180.682,
7
- "eval_steps_per_second": 2.823
8
  }
 
1
  {
2
+ "epoch": 0.6942691239585963,
3
+ "eval_f1": 0.6133951445650848,
4
+ "eval_loss": 0.04044894501566887,
5
+ "eval_runtime": 1162.523,
6
+ "eval_samples_per_second": 177.426,
7
+ "eval_steps_per_second": 2.772
8
  }
p_object.json CHANGED
The diff for this file is too large to render. See raw diff
 
prediction_reference.json CHANGED
The diff for this file is too large to render. See raw diff
 
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.5427922241858116,
3
- "total_flos": 1.0663947529637069e+19,
4
- "train_loss": 0.08201153971428095,
5
- "train_runtime": 47443.3166,
6
- "train_samples_per_second": 13.49,
7
- "train_steps_per_second": 0.211
8
  }
 
1
  {
2
+ "epoch": 0.6942691239585963,
3
+ "total_flos": 1.3639932886745088e+19,
4
+ "train_loss": 0.019194319985129618,
5
+ "train_runtime": 18605.0451,
6
+ "train_samples_per_second": 34.399,
7
+ "train_steps_per_second": 0.537
8
  }
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.6095153739086423,
3
- "best_model_checkpoint": "./step_test_microsoft_dit/checkpoint-2000",
4
- "epoch": 0.5427922241858116,
5
  "eval_steps": 50,
6
- "global_step": 2150,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1770,144 +1770,672 @@
1770
  },
1771
  {
1772
  "epoch": 0.5074476142388286,
1773
- "grad_norm": 0.604178786277771,
1774
  "learning_rate": 2.3970000000000003e-05,
1775
- "loss": 0.0551,
1776
  "step": 2010
1777
  },
1778
  {
1779
  "epoch": 0.5099722292350417,
1780
- "grad_norm": 0.6526350378990173,
1781
  "learning_rate": 2.394e-05,
1782
- "loss": 0.0734,
1783
  "step": 2020
1784
  },
1785
  {
1786
  "epoch": 0.5124968442312547,
1787
- "grad_norm": 0.8096711039543152,
1788
  "learning_rate": 2.3910000000000003e-05,
1789
- "loss": 0.0724,
1790
  "step": 2030
1791
  },
1792
  {
1793
  "epoch": 0.5150214592274678,
1794
- "grad_norm": 1.262484073638916,
1795
  "learning_rate": 2.3880000000000002e-05,
1796
- "loss": 0.0949,
1797
  "step": 2040
1798
  },
1799
  {
1800
  "epoch": 0.5175460742236809,
1801
- "grad_norm": 0.8815634846687317,
1802
  "learning_rate": 2.385e-05,
1803
- "loss": 0.0706,
1804
  "step": 2050
1805
  },
1806
  {
1807
  "epoch": 0.5175460742236809,
1808
- "eval_f1": 0.6041730781067275,
1809
- "eval_loss": 0.04226996377110481,
1810
- "eval_runtime": 1154.7019,
1811
- "eval_samples_per_second": 178.628,
1812
- "eval_steps_per_second": 2.791,
1813
  "step": 2050
1814
  },
1815
  {
1816
  "epoch": 0.520070689219894,
1817
- "grad_norm": 0.6676633954048157,
1818
  "learning_rate": 2.3820000000000002e-05,
1819
- "loss": 0.0831,
1820
  "step": 2060
1821
  },
1822
  {
1823
  "epoch": 0.522595304216107,
1824
- "grad_norm": 0.9431056976318359,
1825
  "learning_rate": 2.379e-05,
1826
- "loss": 0.0762,
1827
  "step": 2070
1828
  },
1829
  {
1830
  "epoch": 0.5251199192123202,
1831
- "grad_norm": 0.8600429892539978,
1832
  "learning_rate": 2.3760000000000003e-05,
1833
- "loss": 0.0674,
1834
  "step": 2080
1835
  },
1836
  {
1837
  "epoch": 0.5276445342085332,
1838
- "grad_norm": 1.0786969661712646,
1839
  "learning_rate": 2.373e-05,
1840
- "loss": 0.0688,
1841
  "step": 2090
1842
  },
1843
  {
1844
  "epoch": 0.5301691492047462,
1845
- "grad_norm": 0.6463090181350708,
1846
  "learning_rate": 2.37e-05,
1847
- "loss": 0.0647,
1848
  "step": 2100
1849
  },
1850
  {
1851
  "epoch": 0.5301691492047462,
1852
- "eval_f1": 0.6056007895386134,
1853
- "eval_loss": 0.04632845148444176,
1854
- "eval_runtime": 1152.9435,
1855
- "eval_samples_per_second": 178.9,
1856
- "eval_steps_per_second": 2.795,
1857
  "step": 2100
1858
  },
1859
  {
1860
  "epoch": 0.5326937642009594,
1861
- "grad_norm": 1.0313136577606201,
1862
  "learning_rate": 2.3670000000000002e-05,
1863
- "loss": 0.0673,
1864
  "step": 2110
1865
  },
1866
  {
1867
  "epoch": 0.5352183791971724,
1868
- "grad_norm": 1.19906485080719,
1869
  "learning_rate": 2.364e-05,
1870
- "loss": 0.0722,
1871
  "step": 2120
1872
  },
1873
  {
1874
  "epoch": 0.5377429941933856,
1875
- "grad_norm": 0.8951911926269531,
1876
  "learning_rate": 2.3610000000000003e-05,
1877
- "loss": 0.0801,
1878
  "step": 2130
1879
  },
1880
  {
1881
  "epoch": 0.5402676091895986,
1882
- "grad_norm": 0.856438159942627,
1883
  "learning_rate": 2.358e-05,
1884
- "loss": 0.0664,
1885
  "step": 2140
1886
  },
1887
  {
1888
  "epoch": 0.5427922241858116,
1889
- "grad_norm": 0.74139404296875,
1890
  "learning_rate": 2.3550000000000003e-05,
1891
- "loss": 0.0626,
1892
  "step": 2150
1893
  },
1894
  {
1895
  "epoch": 0.5427922241858116,
1896
- "eval_f1": 0.6048198696667897,
1897
- "eval_loss": 0.047770071774721146,
1898
- "eval_runtime": 1156.5853,
1899
- "eval_samples_per_second": 178.337,
1900
- "eval_steps_per_second": 2.787,
1901
  "step": 2150
1902
  },
1903
  {
1904
- "epoch": 0.5427922241858116,
1905
- "step": 2150,
1906
- "total_flos": 1.0663947529637069e+19,
1907
- "train_loss": 0.08201153971428095,
1908
- "train_runtime": 47443.3166,
1909
- "train_samples_per_second": 13.49,
1910
- "train_steps_per_second": 0.211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1911
  }
1912
  ],
1913
  "logging_steps": 10,
@@ -1936,7 +2464,7 @@
1936
  "attributes": {}
1937
  }
1938
  },
1939
- "total_flos": 1.0663947529637069e+19,
1940
  "train_batch_size": 64,
1941
  "trial_name": null,
1942
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.6133951445650848,
3
+ "best_model_checkpoint": "./step_test_microsoft_dit/checkpoint-2500",
4
+ "epoch": 0.6942691239585963,
5
  "eval_steps": 50,
6
+ "global_step": 2750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1770
  },
1771
  {
1772
  "epoch": 0.5074476142388286,
1773
+ "grad_norm": 0.7863900065422058,
1774
  "learning_rate": 2.3970000000000003e-05,
1775
+ "loss": 0.061,
1776
  "step": 2010
1777
  },
1778
  {
1779
  "epoch": 0.5099722292350417,
1780
+ "grad_norm": 1.0800750255584717,
1781
  "learning_rate": 2.394e-05,
1782
+ "loss": 0.0781,
1783
  "step": 2020
1784
  },
1785
  {
1786
  "epoch": 0.5124968442312547,
1787
+ "grad_norm": 1.0992929935455322,
1788
  "learning_rate": 2.3910000000000003e-05,
1789
+ "loss": 0.0694,
1790
  "step": 2030
1791
  },
1792
  {
1793
  "epoch": 0.5150214592274678,
1794
+ "grad_norm": 0.703554093837738,
1795
  "learning_rate": 2.3880000000000002e-05,
1796
+ "loss": 0.0881,
1797
  "step": 2040
1798
  },
1799
  {
1800
  "epoch": 0.5175460742236809,
1801
+ "grad_norm": 1.214089274406433,
1802
  "learning_rate": 2.385e-05,
1803
+ "loss": 0.0736,
1804
  "step": 2050
1805
  },
1806
  {
1807
  "epoch": 0.5175460742236809,
1808
+ "eval_f1": 0.612187690432663,
1809
+ "eval_loss": 0.05384594947099686,
1810
+ "eval_runtime": 1155.7771,
1811
+ "eval_samples_per_second": 178.462,
1812
+ "eval_steps_per_second": 2.789,
1813
  "step": 2050
1814
  },
1815
  {
1816
  "epoch": 0.520070689219894,
1817
+ "grad_norm": 0.8359307050704956,
1818
  "learning_rate": 2.3820000000000002e-05,
1819
+ "loss": 0.0759,
1820
  "step": 2060
1821
  },
1822
  {
1823
  "epoch": 0.522595304216107,
1824
+ "grad_norm": 1.6299511194229126,
1825
  "learning_rate": 2.379e-05,
1826
+ "loss": 0.076,
1827
  "step": 2070
1828
  },
1829
  {
1830
  "epoch": 0.5251199192123202,
1831
+ "grad_norm": 0.6880617737770081,
1832
  "learning_rate": 2.3760000000000003e-05,
1833
+ "loss": 0.0745,
1834
  "step": 2080
1835
  },
1836
  {
1837
  "epoch": 0.5276445342085332,
1838
+ "grad_norm": 0.7822777032852173,
1839
  "learning_rate": 2.373e-05,
1840
+ "loss": 0.0697,
1841
  "step": 2090
1842
  },
1843
  {
1844
  "epoch": 0.5301691492047462,
1845
+ "grad_norm": 0.7941886782646179,
1846
  "learning_rate": 2.37e-05,
1847
+ "loss": 0.0685,
1848
  "step": 2100
1849
  },
1850
  {
1851
  "epoch": 0.5301691492047462,
1852
+ "eval_f1": 0.6104315862855695,
1853
+ "eval_loss": 0.04854836314916611,
1854
+ "eval_runtime": 1154.0649,
1855
+ "eval_samples_per_second": 178.727,
1856
+ "eval_steps_per_second": 2.793,
1857
  "step": 2100
1858
  },
1859
  {
1860
  "epoch": 0.5326937642009594,
1861
+ "grad_norm": 0.948130190372467,
1862
  "learning_rate": 2.3670000000000002e-05,
1863
+ "loss": 0.0706,
1864
  "step": 2110
1865
  },
1866
  {
1867
  "epoch": 0.5352183791971724,
1868
+ "grad_norm": 0.959032416343689,
1869
  "learning_rate": 2.364e-05,
1870
+ "loss": 0.0684,
1871
  "step": 2120
1872
  },
1873
  {
1874
  "epoch": 0.5377429941933856,
1875
+ "grad_norm": 1.1859666109085083,
1876
  "learning_rate": 2.3610000000000003e-05,
1877
+ "loss": 0.0757,
1878
  "step": 2130
1879
  },
1880
  {
1881
  "epoch": 0.5402676091895986,
1882
+ "grad_norm": 0.9001142978668213,
1883
  "learning_rate": 2.358e-05,
1884
+ "loss": 0.079,
1885
  "step": 2140
1886
  },
1887
  {
1888
  "epoch": 0.5427922241858116,
1889
+ "grad_norm": 0.47399717569351196,
1890
  "learning_rate": 2.3550000000000003e-05,
1891
+ "loss": 0.0726,
1892
  "step": 2150
1893
  },
1894
  {
1895
  "epoch": 0.5427922241858116,
1896
+ "eval_f1": 0.611992731677771,
1897
+ "eval_loss": 0.05662121623754501,
1898
+ "eval_runtime": 1151.3771,
1899
+ "eval_samples_per_second": 179.144,
1900
+ "eval_steps_per_second": 2.799,
1901
  "step": 2150
1902
  },
1903
  {
1904
+ "epoch": 0.5453168391820248,
1905
+ "grad_norm": 0.6292353272438049,
1906
+ "learning_rate": 2.3520000000000002e-05,
1907
+ "loss": 0.0677,
1908
+ "step": 2160
1909
+ },
1910
+ {
1911
+ "epoch": 0.5478414541782378,
1912
+ "grad_norm": 0.7090362906455994,
1913
+ "learning_rate": 2.349e-05,
1914
+ "loss": 0.0703,
1915
+ "step": 2170
1916
+ },
1917
+ {
1918
+ "epoch": 0.5503660691744509,
1919
+ "grad_norm": 0.6082953810691833,
1920
+ "learning_rate": 2.3460000000000002e-05,
1921
+ "loss": 0.0672,
1922
+ "step": 2180
1923
+ },
1924
+ {
1925
+ "epoch": 0.552890684170664,
1926
+ "grad_norm": 0.5937643051147461,
1927
+ "learning_rate": 2.343e-05,
1928
+ "loss": 0.0686,
1929
+ "step": 2190
1930
+ },
1931
+ {
1932
+ "epoch": 0.555415299166877,
1933
+ "grad_norm": 0.7394770979881287,
1934
+ "learning_rate": 2.3400000000000003e-05,
1935
+ "loss": 0.0731,
1936
+ "step": 2200
1937
+ },
1938
+ {
1939
+ "epoch": 0.555415299166877,
1940
+ "eval_f1": 0.6111780293905084,
1941
+ "eval_loss": 0.05852247402071953,
1942
+ "eval_runtime": 1153.1003,
1943
+ "eval_samples_per_second": 178.876,
1944
+ "eval_steps_per_second": 2.795,
1945
+ "step": 2200
1946
+ },
1947
+ {
1948
+ "epoch": 0.5579399141630901,
1949
+ "grad_norm": 0.7641323804855347,
1950
+ "learning_rate": 2.337e-05,
1951
+ "loss": 0.0732,
1952
+ "step": 2210
1953
+ },
1954
+ {
1955
+ "epoch": 0.5604645291593032,
1956
+ "grad_norm": 0.8567935824394226,
1957
+ "learning_rate": 2.334e-05,
1958
+ "loss": 0.0599,
1959
+ "step": 2220
1960
+ },
1961
+ {
1962
+ "epoch": 0.5629891441555163,
1963
+ "grad_norm": 0.9106941819190979,
1964
+ "learning_rate": 2.3310000000000002e-05,
1965
+ "loss": 0.0593,
1966
+ "step": 2230
1967
+ },
1968
+ {
1969
+ "epoch": 0.5655137591517294,
1970
+ "grad_norm": 1.5944632291793823,
1971
+ "learning_rate": 2.328e-05,
1972
+ "loss": 0.0669,
1973
+ "step": 2240
1974
+ },
1975
+ {
1976
+ "epoch": 0.5680383741479424,
1977
+ "grad_norm": 0.9120457768440247,
1978
+ "learning_rate": 2.3250000000000003e-05,
1979
+ "loss": 0.0722,
1980
+ "step": 2250
1981
+ },
1982
+ {
1983
+ "epoch": 0.5680383741479424,
1984
+ "eval_f1": 0.6139676730710583,
1985
+ "eval_loss": 0.05887339636683464,
1986
+ "eval_runtime": 1155.9087,
1987
+ "eval_samples_per_second": 178.441,
1988
+ "eval_steps_per_second": 2.788,
1989
+ "step": 2250
1990
+ },
1991
+ {
1992
+ "epoch": 0.5705629891441555,
1993
+ "grad_norm": 0.8505953550338745,
1994
+ "learning_rate": 2.322e-05,
1995
+ "loss": 0.0863,
1996
+ "step": 2260
1997
+ },
1998
+ {
1999
+ "epoch": 0.5730876041403686,
2000
+ "grad_norm": 0.9573137164115906,
2001
+ "learning_rate": 2.319e-05,
2002
+ "loss": 0.0712,
2003
+ "step": 2270
2004
+ },
2005
+ {
2006
+ "epoch": 0.5756122191365817,
2007
+ "grad_norm": 1.230735182762146,
2008
+ "learning_rate": 2.3160000000000002e-05,
2009
+ "loss": 0.0677,
2010
+ "step": 2280
2011
+ },
2012
+ {
2013
+ "epoch": 0.5781368341327947,
2014
+ "grad_norm": 1.203621745109558,
2015
+ "learning_rate": 2.313e-05,
2016
+ "loss": 0.0634,
2017
+ "step": 2290
2018
+ },
2019
+ {
2020
+ "epoch": 0.5806614491290079,
2021
+ "grad_norm": 1.3590195178985596,
2022
+ "learning_rate": 2.3100000000000002e-05,
2023
+ "loss": 0.0819,
2024
+ "step": 2300
2025
+ },
2026
+ {
2027
+ "epoch": 0.5806614491290079,
2028
+ "eval_f1": 0.6121980676328502,
2029
+ "eval_loss": 0.050494007766246796,
2030
+ "eval_runtime": 1153.6589,
2031
+ "eval_samples_per_second": 178.789,
2032
+ "eval_steps_per_second": 2.794,
2033
+ "step": 2300
2034
+ },
2035
+ {
2036
+ "epoch": 0.5831860641252209,
2037
+ "grad_norm": 0.8538402318954468,
2038
+ "learning_rate": 2.307e-05,
2039
+ "loss": 0.0674,
2040
+ "step": 2310
2041
+ },
2042
+ {
2043
+ "epoch": 0.5857106791214339,
2044
+ "grad_norm": 1.1863012313842773,
2045
+ "learning_rate": 2.304e-05,
2046
+ "loss": 0.0665,
2047
+ "step": 2320
2048
+ },
2049
+ {
2050
+ "epoch": 0.5882352941176471,
2051
+ "grad_norm": 1.0120714902877808,
2052
+ "learning_rate": 2.301e-05,
2053
+ "loss": 0.0675,
2054
+ "step": 2330
2055
+ },
2056
+ {
2057
+ "epoch": 0.5907599091138601,
2058
+ "grad_norm": 0.8394482135772705,
2059
+ "learning_rate": 2.298e-05,
2060
+ "loss": 0.0812,
2061
+ "step": 2340
2062
+ },
2063
+ {
2064
+ "epoch": 0.5932845241100733,
2065
+ "grad_norm": 0.8855767250061035,
2066
+ "learning_rate": 2.2950000000000002e-05,
2067
+ "loss": 0.0694,
2068
+ "step": 2350
2069
+ },
2070
+ {
2071
+ "epoch": 0.5932845241100733,
2072
+ "eval_f1": 0.6101251634597422,
2073
+ "eval_loss": 0.053731031715869904,
2074
+ "eval_runtime": 1147.8424,
2075
+ "eval_samples_per_second": 179.695,
2076
+ "eval_steps_per_second": 2.808,
2077
+ "step": 2350
2078
+ },
2079
+ {
2080
+ "epoch": 0.5958091391062863,
2081
+ "grad_norm": 1.241045594215393,
2082
+ "learning_rate": 2.292e-05,
2083
+ "loss": 0.0646,
2084
+ "step": 2360
2085
+ },
2086
+ {
2087
+ "epoch": 0.5983337541024993,
2088
+ "grad_norm": 2.065401315689087,
2089
+ "learning_rate": 2.289e-05,
2090
+ "loss": 0.0792,
2091
+ "step": 2370
2092
+ },
2093
+ {
2094
+ "epoch": 0.6008583690987125,
2095
+ "grad_norm": 1.0024877786636353,
2096
+ "learning_rate": 2.286e-05,
2097
+ "loss": 0.0751,
2098
+ "step": 2380
2099
+ },
2100
+ {
2101
+ "epoch": 0.6033829840949255,
2102
+ "grad_norm": 0.4943256080150604,
2103
+ "learning_rate": 2.283e-05,
2104
+ "loss": 0.076,
2105
+ "step": 2390
2106
+ },
2107
+ {
2108
+ "epoch": 0.6059075990911386,
2109
+ "grad_norm": 1.0907814502716064,
2110
+ "learning_rate": 2.2800000000000002e-05,
2111
+ "loss": 0.0705,
2112
+ "step": 2400
2113
+ },
2114
+ {
2115
+ "epoch": 0.6059075990911386,
2116
+ "eval_f1": 0.6130196664177247,
2117
+ "eval_loss": 0.06461644172668457,
2118
+ "eval_runtime": 1149.8253,
2119
+ "eval_samples_per_second": 179.386,
2120
+ "eval_steps_per_second": 2.803,
2121
+ "step": 2400
2122
+ },
2123
+ {
2124
+ "epoch": 0.6084322140873517,
2125
+ "grad_norm": 1.1304162740707397,
2126
+ "learning_rate": 2.277e-05,
2127
+ "loss": 0.0548,
2128
+ "step": 2410
2129
+ },
2130
+ {
2131
+ "epoch": 0.6109568290835647,
2132
+ "grad_norm": 1.3394097089767456,
2133
+ "learning_rate": 2.274e-05,
2134
+ "loss": 0.0607,
2135
+ "step": 2420
2136
+ },
2137
+ {
2138
+ "epoch": 0.6134814440797778,
2139
+ "grad_norm": 0.5467960834503174,
2140
+ "learning_rate": 2.271e-05,
2141
+ "loss": 0.0701,
2142
+ "step": 2430
2143
+ },
2144
+ {
2145
+ "epoch": 0.6160060590759909,
2146
+ "grad_norm": 0.5510517954826355,
2147
+ "learning_rate": 2.268e-05,
2148
+ "loss": 0.0725,
2149
+ "step": 2440
2150
+ },
2151
+ {
2152
+ "epoch": 0.618530674072204,
2153
+ "grad_norm": 0.7682734131813049,
2154
+ "learning_rate": 2.265e-05,
2155
+ "loss": 0.0702,
2156
+ "step": 2450
2157
+ },
2158
+ {
2159
+ "epoch": 0.618530674072204,
2160
+ "eval_f1": 0.6124447065762312,
2161
+ "eval_loss": 0.046234920620918274,
2162
+ "eval_runtime": 1146.4615,
2163
+ "eval_samples_per_second": 179.912,
2164
+ "eval_steps_per_second": 2.811,
2165
+ "step": 2450
2166
+ },
2167
+ {
2168
+ "epoch": 0.6210552890684171,
2169
+ "grad_norm": 0.7578818798065186,
2170
+ "learning_rate": 2.262e-05,
2171
+ "loss": 0.0703,
2172
+ "step": 2460
2173
+ },
2174
+ {
2175
+ "epoch": 0.6235799040646302,
2176
+ "grad_norm": 0.7244108319282532,
2177
+ "learning_rate": 2.2590000000000002e-05,
2178
+ "loss": 0.0635,
2179
+ "step": 2470
2180
+ },
2181
+ {
2182
+ "epoch": 0.6261045190608432,
2183
+ "grad_norm": 1.1047908067703247,
2184
+ "learning_rate": 2.256e-05,
2185
+ "loss": 0.0614,
2186
+ "step": 2480
2187
+ },
2188
+ {
2189
+ "epoch": 0.6286291340570563,
2190
+ "grad_norm": 1.0824987888336182,
2191
+ "learning_rate": 2.253e-05,
2192
+ "loss": 0.081,
2193
+ "step": 2490
2194
+ },
2195
+ {
2196
+ "epoch": 0.6311537490532694,
2197
+ "grad_norm": 1.9344598054885864,
2198
+ "learning_rate": 2.25e-05,
2199
+ "loss": 0.0709,
2200
+ "step": 2500
2201
+ },
2202
+ {
2203
+ "epoch": 0.6311537490532694,
2204
+ "eval_f1": 0.6133951445650848,
2205
+ "eval_loss": 0.04044894501566887,
2206
+ "eval_runtime": 1148.0724,
2207
+ "eval_samples_per_second": 179.659,
2208
+ "eval_steps_per_second": 2.807,
2209
+ "step": 2500
2210
+ },
2211
+ {
2212
+ "epoch": 0.6336783640494824,
2213
+ "grad_norm": 1.2797091007232666,
2214
+ "learning_rate": 2.247e-05,
2215
+ "loss": 0.072,
2216
+ "step": 2510
2217
+ },
2218
+ {
2219
+ "epoch": 0.6362029790456956,
2220
+ "grad_norm": 0.7228933572769165,
2221
+ "learning_rate": 2.2440000000000002e-05,
2222
+ "loss": 0.071,
2223
+ "step": 2520
2224
+ },
2225
+ {
2226
+ "epoch": 0.6387275940419086,
2227
+ "grad_norm": 0.9655591249465942,
2228
+ "learning_rate": 2.241e-05,
2229
+ "loss": 0.0611,
2230
+ "step": 2530
2231
+ },
2232
+ {
2233
+ "epoch": 0.6412522090381216,
2234
+ "grad_norm": 0.9924450516700745,
2235
+ "learning_rate": 2.238e-05,
2236
+ "loss": 0.0676,
2237
+ "step": 2540
2238
+ },
2239
+ {
2240
+ "epoch": 0.6437768240343348,
2241
+ "grad_norm": 1.12591552734375,
2242
+ "learning_rate": 2.235e-05,
2243
+ "loss": 0.0804,
2244
+ "step": 2550
2245
+ },
2246
+ {
2247
+ "epoch": 0.6437768240343348,
2248
+ "eval_f1": 0.612305676335696,
2249
+ "eval_loss": 0.04778852313756943,
2250
+ "eval_runtime": 1160.4576,
2251
+ "eval_samples_per_second": 177.742,
2252
+ "eval_steps_per_second": 2.777,
2253
+ "step": 2550
2254
+ },
2255
+ {
2256
+ "epoch": 0.6463014390305478,
2257
+ "grad_norm": 0.7478006482124329,
2258
+ "learning_rate": 2.232e-05,
2259
+ "loss": 0.0638,
2260
+ "step": 2560
2261
+ },
2262
+ {
2263
+ "epoch": 0.648826054026761,
2264
+ "grad_norm": 0.7661213874816895,
2265
+ "learning_rate": 2.2290000000000002e-05,
2266
+ "loss": 0.0632,
2267
+ "step": 2570
2268
+ },
2269
+ {
2270
+ "epoch": 0.651350669022974,
2271
+ "grad_norm": 0.9824168086051941,
2272
+ "learning_rate": 2.226e-05,
2273
+ "loss": 0.0602,
2274
+ "step": 2580
2275
+ },
2276
+ {
2277
+ "epoch": 0.653875284019187,
2278
+ "grad_norm": 1.1700901985168457,
2279
+ "learning_rate": 2.223e-05,
2280
+ "loss": 0.0714,
2281
+ "step": 2590
2282
+ },
2283
+ {
2284
+ "epoch": 0.6563998990154002,
2285
+ "grad_norm": 0.8846214413642883,
2286
+ "learning_rate": 2.22e-05,
2287
+ "loss": 0.0666,
2288
+ "step": 2600
2289
+ },
2290
+ {
2291
+ "epoch": 0.6563998990154002,
2292
+ "eval_f1": 0.6104417670682731,
2293
+ "eval_loss": 0.04546576738357544,
2294
+ "eval_runtime": 1160.1326,
2295
+ "eval_samples_per_second": 177.792,
2296
+ "eval_steps_per_second": 2.778,
2297
+ "step": 2600
2298
+ },
2299
+ {
2300
+ "epoch": 0.6589245140116132,
2301
+ "grad_norm": 0.7641239166259766,
2302
+ "learning_rate": 2.217e-05,
2303
+ "loss": 0.058,
2304
+ "step": 2610
2305
+ },
2306
+ {
2307
+ "epoch": 0.6614491290078263,
2308
+ "grad_norm": 0.5828648209571838,
2309
+ "learning_rate": 2.214e-05,
2310
+ "loss": 0.0686,
2311
+ "step": 2620
2312
+ },
2313
+ {
2314
+ "epoch": 0.6639737440040394,
2315
+ "grad_norm": 0.6906914710998535,
2316
+ "learning_rate": 2.211e-05,
2317
+ "loss": 0.0764,
2318
+ "step": 2630
2319
+ },
2320
+ {
2321
+ "epoch": 0.6664983590002524,
2322
+ "grad_norm": 1.3137489557266235,
2323
+ "learning_rate": 2.208e-05,
2324
+ "loss": 0.0768,
2325
+ "step": 2640
2326
+ },
2327
+ {
2328
+ "epoch": 0.6690229739964655,
2329
+ "grad_norm": 0.863865077495575,
2330
+ "learning_rate": 2.205e-05,
2331
+ "loss": 0.0749,
2332
+ "step": 2650
2333
+ },
2334
+ {
2335
+ "epoch": 0.6690229739964655,
2336
+ "eval_f1": 0.6131900703964431,
2337
+ "eval_loss": 0.04790908098220825,
2338
+ "eval_runtime": 1162.4462,
2339
+ "eval_samples_per_second": 177.438,
2340
+ "eval_steps_per_second": 2.773,
2341
+ "step": 2650
2342
+ },
2343
+ {
2344
+ "epoch": 0.6715475889926786,
2345
+ "grad_norm": 0.9182652235031128,
2346
+ "learning_rate": 2.202e-05,
2347
+ "loss": 0.0625,
2348
+ "step": 2660
2349
+ },
2350
+ {
2351
+ "epoch": 0.6740722039888917,
2352
+ "grad_norm": 1.4961283206939697,
2353
+ "learning_rate": 2.199e-05,
2354
+ "loss": 0.0726,
2355
+ "step": 2670
2356
+ },
2357
+ {
2358
+ "epoch": 0.6765968189851048,
2359
+ "grad_norm": 0.7803681492805481,
2360
+ "learning_rate": 2.196e-05,
2361
+ "loss": 0.0669,
2362
+ "step": 2680
2363
+ },
2364
+ {
2365
+ "epoch": 0.6791214339813179,
2366
+ "grad_norm": 1.0371824502944946,
2367
+ "learning_rate": 2.193e-05,
2368
+ "loss": 0.0566,
2369
+ "step": 2690
2370
+ },
2371
+ {
2372
+ "epoch": 0.6816460489775309,
2373
+ "grad_norm": 1.1832714080810547,
2374
+ "learning_rate": 2.19e-05,
2375
+ "loss": 0.067,
2376
+ "step": 2700
2377
+ },
2378
+ {
2379
+ "epoch": 0.6816460489775309,
2380
+ "eval_f1": 0.6132461161079312,
2381
+ "eval_loss": 0.055793602019548416,
2382
+ "eval_runtime": 1161.8914,
2383
+ "eval_samples_per_second": 177.523,
2384
+ "eval_steps_per_second": 2.774,
2385
+ "step": 2700
2386
+ },
2387
+ {
2388
+ "epoch": 0.684170663973744,
2389
+ "grad_norm": 0.7899573445320129,
2390
+ "learning_rate": 2.187e-05,
2391
+ "loss": 0.0763,
2392
+ "step": 2710
2393
+ },
2394
+ {
2395
+ "epoch": 0.6866952789699571,
2396
+ "grad_norm": 1.4638808965682983,
2397
+ "learning_rate": 2.184e-05,
2398
+ "loss": 0.0768,
2399
+ "step": 2720
2400
+ },
2401
+ {
2402
+ "epoch": 0.6892198939661701,
2403
+ "grad_norm": 0.7547538876533508,
2404
+ "learning_rate": 2.181e-05,
2405
+ "loss": 0.0761,
2406
+ "step": 2730
2407
+ },
2408
+ {
2409
+ "epoch": 0.6917445089623833,
2410
+ "grad_norm": 0.5143932700157166,
2411
+ "learning_rate": 2.178e-05,
2412
+ "loss": 0.0808,
2413
+ "step": 2740
2414
+ },
2415
+ {
2416
+ "epoch": 0.6942691239585963,
2417
+ "grad_norm": 1.011730432510376,
2418
+ "learning_rate": 2.175e-05,
2419
+ "loss": 0.068,
2420
+ "step": 2750
2421
+ },
2422
+ {
2423
+ "epoch": 0.6942691239585963,
2424
+ "eval_f1": 0.6108202443280978,
2425
+ "eval_loss": 0.053855251520872116,
2426
+ "eval_runtime": 1160.3338,
2427
+ "eval_samples_per_second": 177.761,
2428
+ "eval_steps_per_second": 2.778,
2429
+ "step": 2750
2430
+ },
2431
+ {
2432
+ "epoch": 0.6942691239585963,
2433
+ "step": 2750,
2434
+ "total_flos": 1.3639932886745088e+19,
2435
+ "train_loss": 0.019194319985129618,
2436
+ "train_runtime": 18605.0451,
2437
+ "train_samples_per_second": 34.399,
2438
+ "train_steps_per_second": 0.537
2439
  }
2440
  ],
2441
  "logging_steps": 10,
 
2464
  "attributes": {}
2465
  }
2466
  },
2467
+ "total_flos": 1.3639932886745088e+19,
2468
  "train_batch_size": 64,
2469
  "trial_name": null,
2470
  "trial_params": null