alhosseini commited on
Commit
a20e64f
1 Parent(s): 1c6ceee

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +68 -0
  2. checkpoint-1000/optimizer_0/.metadata +3 -0
  3. checkpoint-1000/optimizer_0/__0_0.distcp +3 -0
  4. checkpoint-1000/optimizer_0/__1_0.distcp +3 -0
  5. checkpoint-1000/optimizer_0/__2_0.distcp +3 -0
  6. checkpoint-1000/optimizer_0/__3_0.distcp +3 -0
  7. checkpoint-1000/optimizer_0/__4_0.distcp +3 -0
  8. checkpoint-1000/optimizer_0/__5_0.distcp +3 -0
  9. checkpoint-1000/optimizer_0/__6_0.distcp +3 -0
  10. checkpoint-1000/optimizer_0/__7_0.distcp +3 -0
  11. checkpoint-1000/pytorch_model_fsdp_0/.metadata +0 -0
  12. checkpoint-1000/pytorch_model_fsdp_0/__0_0.distcp +3 -0
  13. checkpoint-1000/pytorch_model_fsdp_0/__1_0.distcp +3 -0
  14. checkpoint-1000/pytorch_model_fsdp_0/__2_0.distcp +3 -0
  15. checkpoint-1000/pytorch_model_fsdp_0/__3_0.distcp +3 -0
  16. checkpoint-1000/pytorch_model_fsdp_0/__4_0.distcp +3 -0
  17. checkpoint-1000/pytorch_model_fsdp_0/__5_0.distcp +3 -0
  18. checkpoint-1000/pytorch_model_fsdp_0/__6_0.distcp +3 -0
  19. checkpoint-1000/pytorch_model_fsdp_0/__7_0.distcp +3 -0
  20. checkpoint-1000/rng_state_0.pth +3 -0
  21. checkpoint-1000/rng_state_1.pth +3 -0
  22. checkpoint-1000/rng_state_2.pth +3 -0
  23. checkpoint-1000/rng_state_3.pth +3 -0
  24. checkpoint-1000/rng_state_4.pth +3 -0
  25. checkpoint-1000/rng_state_5.pth +3 -0
  26. checkpoint-1000/rng_state_6.pth +3 -0
  27. checkpoint-1000/rng_state_7.pth +3 -0
  28. checkpoint-1000/scheduler.pt +3 -0
  29. checkpoint-1000/trainer_state.json +381 -0
  30. checkpoint-250/optimizer_0/.metadata +3 -0
  31. checkpoint-250/optimizer_0/__0_0.distcp +3 -0
  32. checkpoint-250/optimizer_0/__1_0.distcp +3 -0
  33. checkpoint-250/optimizer_0/__2_0.distcp +3 -0
  34. checkpoint-250/optimizer_0/__3_0.distcp +3 -0
  35. checkpoint-250/optimizer_0/__4_0.distcp +3 -0
  36. checkpoint-250/optimizer_0/__5_0.distcp +3 -0
  37. checkpoint-250/optimizer_0/__6_0.distcp +3 -0
  38. checkpoint-250/optimizer_0/__7_0.distcp +3 -0
  39. checkpoint-250/pytorch_model_fsdp_0/.metadata +0 -0
  40. checkpoint-250/pytorch_model_fsdp_0/__0_0.distcp +3 -0
  41. checkpoint-250/pytorch_model_fsdp_0/__1_0.distcp +3 -0
  42. checkpoint-250/pytorch_model_fsdp_0/__2_0.distcp +3 -0
  43. checkpoint-250/pytorch_model_fsdp_0/__3_0.distcp +3 -0
  44. checkpoint-250/pytorch_model_fsdp_0/__4_0.distcp +3 -0
  45. checkpoint-250/pytorch_model_fsdp_0/__5_0.distcp +3 -0
  46. checkpoint-250/pytorch_model_fsdp_0/__6_0.distcp +3 -0
  47. checkpoint-250/pytorch_model_fsdp_0/__7_0.distcp +3 -0
  48. checkpoint-250/rng_state_0.pth +3 -0
  49. checkpoint-250/rng_state_1.pth +3 -0
  50. checkpoint-250/rng_state_2.pth +3 -0
.gitattributes CHANGED
@@ -33,3 +33,71 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-1000/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-1000/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
38
+ checkpoint-1000/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
39
+ checkpoint-1000/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
40
+ checkpoint-1000/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
41
+ checkpoint-1000/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
42
+ checkpoint-1000/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
43
+ checkpoint-1000/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
44
+ checkpoint-1000/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
45
+ checkpoint-1000/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
46
+ checkpoint-1000/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
47
+ checkpoint-1000/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
48
+ checkpoint-1000/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
49
+ checkpoint-1000/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
50
+ checkpoint-1000/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
51
+ checkpoint-1000/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
52
+ checkpoint-1000/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
53
+ checkpoint-250/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text
54
+ checkpoint-250/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
55
+ checkpoint-250/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
56
+ checkpoint-250/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
57
+ checkpoint-250/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
58
+ checkpoint-250/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
59
+ checkpoint-250/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
60
+ checkpoint-250/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
61
+ checkpoint-250/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
62
+ checkpoint-250/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
63
+ checkpoint-250/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
64
+ checkpoint-250/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
65
+ checkpoint-250/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
66
+ checkpoint-250/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
67
+ checkpoint-250/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
68
+ checkpoint-250/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
69
+ checkpoint-250/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
70
+ checkpoint-500/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text
71
+ checkpoint-500/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
72
+ checkpoint-500/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
73
+ checkpoint-500/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
74
+ checkpoint-500/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
75
+ checkpoint-500/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
76
+ checkpoint-500/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
77
+ checkpoint-500/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
78
+ checkpoint-500/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
79
+ checkpoint-500/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
80
+ checkpoint-500/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
81
+ checkpoint-500/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
82
+ checkpoint-500/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
83
+ checkpoint-500/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
84
+ checkpoint-500/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
85
+ checkpoint-500/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
86
+ checkpoint-500/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
87
+ checkpoint-750/optimizer_0/.metadata filter=lfs diff=lfs merge=lfs -text
88
+ checkpoint-750/optimizer_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
89
+ checkpoint-750/optimizer_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
90
+ checkpoint-750/optimizer_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
91
+ checkpoint-750/optimizer_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
92
+ checkpoint-750/optimizer_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
93
+ checkpoint-750/optimizer_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
94
+ checkpoint-750/optimizer_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
95
+ checkpoint-750/optimizer_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
96
+ checkpoint-750/pytorch_model_fsdp_0/__0_0.distcp filter=lfs diff=lfs merge=lfs -text
97
+ checkpoint-750/pytorch_model_fsdp_0/__1_0.distcp filter=lfs diff=lfs merge=lfs -text
98
+ checkpoint-750/pytorch_model_fsdp_0/__2_0.distcp filter=lfs diff=lfs merge=lfs -text
99
+ checkpoint-750/pytorch_model_fsdp_0/__3_0.distcp filter=lfs diff=lfs merge=lfs -text
100
+ checkpoint-750/pytorch_model_fsdp_0/__4_0.distcp filter=lfs diff=lfs merge=lfs -text
101
+ checkpoint-750/pytorch_model_fsdp_0/__5_0.distcp filter=lfs diff=lfs merge=lfs -text
102
+ checkpoint-750/pytorch_model_fsdp_0/__6_0.distcp filter=lfs diff=lfs merge=lfs -text
103
+ checkpoint-750/pytorch_model_fsdp_0/__7_0.distcp filter=lfs diff=lfs merge=lfs -text
checkpoint-1000/optimizer_0/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42a07921faeb91b5e7dc24c8f800f35e960259f183f1fb74189978fce8238fa6
3
+ size 1090439
checkpoint-1000/optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a7f177789141ca996b0e90a341e39e2de25e91e692e7bc78472f16216b9b2ed9
3
+ size 8031213736
checkpoint-1000/optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d79d59514bcb88df1ac25a0cb683cdbc222bf5f9c43a595a718cce42dcb2def2
3
+ size 8030948008
checkpoint-1000/optimizer_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64a175196b451493422ed2dc99858b2d77e7586b65bcb84ed9dcba617c822054
3
+ size 8030948008
checkpoint-1000/optimizer_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98b972e9bee455401f97c87120f0003d517be8ec5682ea29ca0c0e700ac9cf7d
3
+ size 8030948008
checkpoint-1000/optimizer_0/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fbb6893976df4d88d90b48a49aebc8f1d55d542adafa18cdf69c24f7fe41bee
3
+ size 8030948008
checkpoint-1000/optimizer_0/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e29028cb17f1abd4b53b4f04bbc74f7248de5df95a2c3d37e70600727058adf
3
+ size 8030948008
checkpoint-1000/optimizer_0/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3db7c61c19ab005c404d6ac71bb33c354e9dfd4e2af619350b875eb3067a3b3
3
+ size 8030948008
checkpoint-1000/optimizer_0/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a19524bc3be3d623d5dd5e17f2f271b019cfafb8ef88c5c86cde78f8f60ec5f
3
+ size 8030948008
checkpoint-1000/pytorch_model_fsdp_0/.metadata ADDED
Binary file (456 kB). View file
 
checkpoint-1000/pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2c168d0c1e98da39178db29c2cd84f774a394872d6272a6a3a322e9608ee575
3
+ size 4015474004
checkpoint-1000/pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c385938b4e958ffb4d1a1fba0704adf4a74b007878fcda6c49c43e6ed50c66b6
3
+ size 4015474004
checkpoint-1000/pytorch_model_fsdp_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b6cd014abd1ed8aaaa13278023ec159e31f02bdaf760d947e1db78965d27f38
3
+ size 4015474004
checkpoint-1000/pytorch_model_fsdp_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c242acc5eb7fbd12f2f4d8c9e8b8f36b8b1c4372061d3ee4d84f39af720f585
3
+ size 4015474004
checkpoint-1000/pytorch_model_fsdp_0/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33763ca9f6987d5ada8ad623b33e20c37cfec0c70464e7f3fc3f41010fe97172
3
+ size 4015474004
checkpoint-1000/pytorch_model_fsdp_0/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f6c8447522ea6e1ed4d15bfce77a24f327b180a622be74323874c3a5978b769
3
+ size 4015474004
checkpoint-1000/pytorch_model_fsdp_0/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27a02007add9e064fa3e8f61d6e521552f2a41cdff9082fa8421e585392e505a
3
+ size 4015474004
checkpoint-1000/pytorch_model_fsdp_0/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90835f2f41af6f868e3a8bf111ef2fe4c7f76e028db22b7bb68566660d8e3b10
3
+ size 4015474004
checkpoint-1000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f2e52b4ff6d63af0a2316dfdb146ae3aa9823994d4f90b1532b6f7bbec5dcca
3
+ size 14960
checkpoint-1000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7701c1c9415fc3393e58d39af6ac55dcb31801620f895253c16ebf5cb20b5bbe
3
+ size 14960
checkpoint-1000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f81001d5616bc0c5224760c3ce1f52f630fa887a76a3e7770d02a83bcf38dbc0
3
+ size 14960
checkpoint-1000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:572b48ef38fa0ff75e6294c23aea18390b9aa73f011b8dfac9956f5f69e2328f
3
+ size 14960
checkpoint-1000/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:434403b20b0d8a9fa1b8d4d1af4b56ffe7ae4a7532a8099aeec698d4e7125fc4
3
+ size 14960
checkpoint-1000/rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:397f0bc350fc31f88f1987a7e43045e8bc76b6ef407f94011cddf6a0edcb12d0
3
+ size 14960
checkpoint-1000/rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba6d6fe98c76e08aab2a10cfecd2d8a74b059f9683bacb8dc6c3c75a3336e882
3
+ size 14960
checkpoint-1000/rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bea82e2b8f0de62b5067b75169b2e3386b3dacbb75f62ee322ec552172e4c93
3
+ size 14960
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f627496766215c292bc1d6ceecb7b0e07bcc89f3a3d097e9d4c5b8a4241c674f
3
+ size 1064
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 4.201680672268908,
5
+ "eval_steps": 100,
6
+ "global_step": 1000,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.10504201680672269,
13
+ "grad_norm": 8.704133033752441,
14
+ "learning_rate": 4.166666666666667e-05,
15
+ "loss": 3.5481,
16
+ "step": 25
17
+ },
18
+ {
19
+ "epoch": 0.21008403361344538,
20
+ "grad_norm": 10.129899978637695,
21
+ "learning_rate": 4.9947570655942796e-05,
22
+ "loss": 3.0062,
23
+ "step": 50
24
+ },
25
+ {
26
+ "epoch": 0.31512605042016806,
27
+ "grad_norm": 6.111865997314453,
28
+ "learning_rate": 4.9734953280908904e-05,
29
+ "loss": 2.9095,
30
+ "step": 75
31
+ },
32
+ {
33
+ "epoch": 0.42016806722689076,
34
+ "grad_norm": 4.366275787353516,
35
+ "learning_rate": 4.936026311617316e-05,
36
+ "loss": 2.9062,
37
+ "step": 100
38
+ },
39
+ {
40
+ "epoch": 0.42016806722689076,
41
+ "eval_loss": 2.872298240661621,
42
+ "eval_runtime": 35.9441,
43
+ "eval_samples_per_second": 13.02,
44
+ "eval_steps_per_second": 1.641,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.5252100840336135,
49
+ "grad_norm": 25.31424903869629,
50
+ "learning_rate": 4.882595527372152e-05,
51
+ "loss": 2.874,
52
+ "step": 125
53
+ },
54
+ {
55
+ "epoch": 0.6302521008403361,
56
+ "grad_norm": 53.229515075683594,
57
+ "learning_rate": 4.813553074106761e-05,
58
+ "loss": 2.887,
59
+ "step": 150
60
+ },
61
+ {
62
+ "epoch": 0.7352941176470589,
63
+ "grad_norm": 7.568065166473389,
64
+ "learning_rate": 4.7293513441455364e-05,
65
+ "loss": 2.8903,
66
+ "step": 175
67
+ },
68
+ {
69
+ "epoch": 0.8403361344537815,
70
+ "grad_norm": 6.394404888153076,
71
+ "learning_rate": 4.630542059139924e-05,
72
+ "loss": 2.7455,
73
+ "step": 200
74
+ },
75
+ {
76
+ "epoch": 0.8403361344537815,
77
+ "eval_loss": 2.743553638458252,
78
+ "eval_runtime": 35.9282,
79
+ "eval_samples_per_second": 13.026,
80
+ "eval_steps_per_second": 1.642,
81
+ "step": 200
82
+ },
83
+ {
84
+ "epoch": 0.9453781512605042,
85
+ "grad_norm": 3.6301393508911133,
86
+ "learning_rate": 4.517772654979023e-05,
87
+ "loss": 2.7609,
88
+ "step": 225
89
+ },
90
+ {
91
+ "epoch": 1.050420168067227,
92
+ "grad_norm": 21.900226593017578,
93
+ "learning_rate": 4.391782039544238e-05,
94
+ "loss": 2.7411,
95
+ "step": 250
96
+ },
97
+ {
98
+ "epoch": 1.1554621848739495,
99
+ "grad_norm": 9.00839900970459,
100
+ "learning_rate": 4.253395751104748e-05,
101
+ "loss": 2.3212,
102
+ "step": 275
103
+ },
104
+ {
105
+ "epoch": 1.2605042016806722,
106
+ "grad_norm": 3.9047327041625977,
107
+ "learning_rate": 4.10352054907785e-05,
108
+ "loss": 2.0506,
109
+ "step": 300
110
+ },
111
+ {
112
+ "epoch": 1.2605042016806722,
113
+ "eval_loss": 2.8556525707244873,
114
+ "eval_runtime": 35.8984,
115
+ "eval_samples_per_second": 13.037,
116
+ "eval_steps_per_second": 1.644,
117
+ "step": 300
118
+ },
119
+ {
120
+ "epoch": 1.365546218487395,
121
+ "grad_norm": 3.455016613006592,
122
+ "learning_rate": 3.943138472597549e-05,
123
+ "loss": 1.826,
124
+ "step": 325
125
+ },
126
+ {
127
+ "epoch": 1.4705882352941178,
128
+ "grad_norm": 10.288714408874512,
129
+ "learning_rate": 3.773300405821908e-05,
130
+ "loss": 1.8779,
131
+ "step": 350
132
+ },
133
+ {
134
+ "epoch": 1.5756302521008403,
135
+ "grad_norm": 2.957385540008545,
136
+ "learning_rate": 3.595119192141706e-05,
137
+ "loss": 1.85,
138
+ "step": 375
139
+ },
140
+ {
141
+ "epoch": 1.680672268907563,
142
+ "grad_norm": 3.40120267868042,
143
+ "learning_rate": 3.409762342408719e-05,
144
+ "loss": 1.835,
145
+ "step": 400
146
+ },
147
+ {
148
+ "epoch": 1.680672268907563,
149
+ "eval_loss": 2.7827320098876953,
150
+ "eval_runtime": 36.0584,
151
+ "eval_samples_per_second": 12.979,
152
+ "eval_steps_per_second": 1.636,
153
+ "step": 400
154
+ },
155
+ {
156
+ "epoch": 1.7857142857142856,
157
+ "grad_norm": 3.446842670440674,
158
+ "learning_rate": 3.218444384962071e-05,
159
+ "loss": 1.8477,
160
+ "step": 425
161
+ },
162
+ {
163
+ "epoch": 1.8907563025210083,
164
+ "grad_norm": 3.1697998046875,
165
+ "learning_rate": 3.0224189075781884e-05,
166
+ "loss": 1.7934,
167
+ "step": 450
168
+ },
169
+ {
170
+ "epoch": 1.995798319327731,
171
+ "grad_norm": 2.8750455379486084,
172
+ "learning_rate": 2.8229703434885163e-05,
173
+ "loss": 1.7526,
174
+ "step": 475
175
+ },
176
+ {
177
+ "epoch": 2.100840336134454,
178
+ "grad_norm": 2.977962017059326,
179
+ "learning_rate": 2.621405555286121e-05,
180
+ "loss": 0.8246,
181
+ "step": 500
182
+ },
183
+ {
184
+ "epoch": 2.100840336134454,
185
+ "eval_loss": 3.001242160797119,
186
+ "eval_runtime": 35.8998,
187
+ "eval_samples_per_second": 13.036,
188
+ "eval_steps_per_second": 1.643,
189
+ "step": 500
190
+ },
191
+ {
192
+ "epoch": 2.2058823529411766,
193
+ "grad_norm": 2.8460147380828857,
194
+ "learning_rate": 2.419045271866611e-05,
195
+ "loss": 0.7854,
196
+ "step": 525
197
+ },
198
+ {
199
+ "epoch": 2.310924369747899,
200
+ "grad_norm": 3.5654945373535156,
201
+ "learning_rate": 2.2172154345117894e-05,
202
+ "loss": 0.8065,
203
+ "step": 550
204
+ },
205
+ {
206
+ "epoch": 2.4159663865546217,
207
+ "grad_norm": 2.3066728115081787,
208
+ "learning_rate": 2.0172385088197803e-05,
209
+ "loss": 0.7753,
210
+ "step": 575
211
+ },
212
+ {
213
+ "epoch": 2.5210084033613445,
214
+ "grad_norm": 2.5332283973693848,
215
+ "learning_rate": 1.820424819409143e-05,
216
+ "loss": 0.824,
217
+ "step": 600
218
+ },
219
+ {
220
+ "epoch": 2.5210084033613445,
221
+ "eval_loss": 2.8449320793151855,
222
+ "eval_runtime": 35.8956,
223
+ "eval_samples_per_second": 13.038,
224
+ "eval_steps_per_second": 1.644,
225
+ "step": 600
226
+ },
227
+ {
228
+ "epoch": 2.6260504201680672,
229
+ "grad_norm": 3.0348715782165527,
230
+ "learning_rate": 1.6280639641752942e-05,
231
+ "loss": 0.7481,
232
+ "step": 625
233
+ },
234
+ {
235
+ "epoch": 2.73109243697479,
236
+ "grad_norm": 2.7579290866851807,
237
+ "learning_rate": 1.4414163643562755e-05,
238
+ "loss": 0.7543,
239
+ "step": 650
240
+ },
241
+ {
242
+ "epoch": 2.8361344537815127,
243
+ "grad_norm": 2.145768404006958,
244
+ "learning_rate": 1.2617050057750322e-05,
245
+ "loss": 0.753,
246
+ "step": 675
247
+ },
248
+ {
249
+ "epoch": 2.9411764705882355,
250
+ "grad_norm": 2.6759989261627197,
251
+ "learning_rate": 1.0901074253727336e-05,
252
+ "loss": 0.7418,
253
+ "step": 700
254
+ },
255
+ {
256
+ "epoch": 2.9411764705882355,
257
+ "eval_loss": 2.796262741088867,
258
+ "eval_runtime": 35.8962,
259
+ "eval_samples_per_second": 13.038,
260
+ "eval_steps_per_second": 1.644,
261
+ "step": 700
262
+ },
263
+ {
264
+ "epoch": 3.046218487394958,
265
+ "grad_norm": 1.9286493062973022,
266
+ "learning_rate": 9.277479955403887e-06,
267
+ "loss": 0.5768,
268
+ "step": 725
269
+ },
270
+ {
271
+ "epoch": 3.1512605042016806,
272
+ "grad_norm": 1.2430107593536377,
273
+ "learning_rate": 7.756905568047393e-06,
274
+ "loss": 0.2213,
275
+ "step": 750
276
+ },
277
+ {
278
+ "epoch": 3.2563025210084033,
279
+ "grad_norm": 1.5327359437942505,
280
+ "learning_rate": 6.349314471418849e-06,
281
+ "loss": 0.2563,
282
+ "step": 775
283
+ },
284
+ {
285
+ "epoch": 3.361344537815126,
286
+ "grad_norm": 1.448890209197998,
287
+ "learning_rate": 5.063929735931985e-06,
288
+ "loss": 0.2227,
289
+ "step": 800
290
+ },
291
+ {
292
+ "epoch": 3.361344537815126,
293
+ "eval_loss": 3.311384439468384,
294
+ "eval_runtime": 35.9006,
295
+ "eval_samples_per_second": 13.036,
296
+ "eval_steps_per_second": 1.643,
297
+ "step": 800
298
+ },
299
+ {
300
+ "epoch": 3.466386554621849,
301
+ "grad_norm": 1.3534725904464722,
302
+ "learning_rate": 3.90917368959989e-06,
303
+ "loss": 0.2464,
304
+ "step": 825
305
+ },
306
+ {
307
+ "epoch": 3.571428571428571,
308
+ "grad_norm": 1.3151746988296509,
309
+ "learning_rate": 2.892612731749414e-06,
310
+ "loss": 0.2439,
311
+ "step": 850
312
+ },
313
+ {
314
+ "epoch": 3.6764705882352944,
315
+ "grad_norm": 1.4781558513641357,
316
+ "learning_rate": 2.020907755104698e-06,
317
+ "loss": 0.2355,
318
+ "step": 875
319
+ },
320
+ {
321
+ "epoch": 3.7815126050420167,
322
+ "grad_norm": 1.4184162616729736,
323
+ "learning_rate": 1.2997705010932393e-06,
324
+ "loss": 0.2198,
325
+ "step": 900
326
+ },
327
+ {
328
+ "epoch": 3.7815126050420167,
329
+ "eval_loss": 3.312718152999878,
330
+ "eval_runtime": 35.8966,
331
+ "eval_samples_per_second": 13.037,
332
+ "eval_steps_per_second": 1.644,
333
+ "step": 900
334
+ },
335
+ {
336
+ "epoch": 3.8865546218487395,
337
+ "grad_norm": 1.545154333114624,
338
+ "learning_rate": 7.339261343510206e-07,
339
+ "loss": 0.222,
340
+ "step": 925
341
+ },
342
+ {
343
+ "epoch": 3.991596638655462,
344
+ "grad_norm": 1.5649031400680542,
345
+ "learning_rate": 3.270822816527325e-07,
346
+ "loss": 0.2069,
347
+ "step": 950
348
+ },
349
+ {
350
+ "epoch": 4.0966386554621845,
351
+ "grad_norm": 0.9737259149551392,
352
+ "learning_rate": 8.190473813576572e-08,
353
+ "loss": 0.1163,
354
+ "step": 975
355
+ },
356
+ {
357
+ "epoch": 4.201680672268908,
358
+ "grad_norm": 0.9735974669456482,
359
+ "learning_rate": 0.0,
360
+ "loss": 0.1478,
361
+ "step": 1000
362
+ },
363
+ {
364
+ "epoch": 4.201680672268908,
365
+ "eval_loss": 3.372861385345459,
366
+ "eval_runtime": 35.8981,
367
+ "eval_samples_per_second": 13.037,
368
+ "eval_steps_per_second": 1.644,
369
+ "step": 1000
370
+ }
371
+ ],
372
+ "logging_steps": 25,
373
+ "max_steps": 1000,
374
+ "num_input_tokens_seen": 0,
375
+ "num_train_epochs": 5,
376
+ "save_steps": 250,
377
+ "total_flos": 9.22205177249792e+16,
378
+ "train_batch_size": 1,
379
+ "trial_name": null,
380
+ "trial_params": null
381
+ }
checkpoint-250/optimizer_0/.metadata ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42a07921faeb91b5e7dc24c8f800f35e960259f183f1fb74189978fce8238fa6
3
+ size 1090439
checkpoint-250/optimizer_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0cfd271f3dee1a625ddae0dc9979b3488ff3b4c52d49c395600c818a5272e397
3
+ size 8031213736
checkpoint-250/optimizer_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89f5bb8dcc5e461163c2f5c78ed9806bdded86f52057b6c9cf47f2296db1bfaa
3
+ size 8030948008
checkpoint-250/optimizer_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b212f9e1e8402eb564cc0b72eb224d507437e32bfd2207199836b9856f4158f
3
+ size 8030948008
checkpoint-250/optimizer_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:834495a2cdadbe24135ed59c8f32a7314d66758d5e1d8fbfa1da88774b031b72
3
+ size 8030948008
checkpoint-250/optimizer_0/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8e99a458df63f3a569ac0737ee401ca59725efeb9fd7cf23560271759553d5b
3
+ size 8030948008
checkpoint-250/optimizer_0/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3455e842b33fd85391b1cb135d011a7a200b922e33c9b20b5ff24fc13c88a914
3
+ size 8030948008
checkpoint-250/optimizer_0/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73a0ec7ea24a0d0384ee39f363d9f7c2207190de2bc728875e11a620562ce008
3
+ size 8030948008
checkpoint-250/optimizer_0/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7544855e1e3089cb67dbfce67f657857a6e58c5563e68f1b9024ed6f03048088
3
+ size 8030948008
checkpoint-250/pytorch_model_fsdp_0/.metadata ADDED
Binary file (456 kB). View file
 
checkpoint-250/pytorch_model_fsdp_0/__0_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad4e190657e60a65f0c9dc8730e20f624811f7ce93e1d0f471e92893191bd6c5
3
+ size 4015474004
checkpoint-250/pytorch_model_fsdp_0/__1_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca42926a244a70554e7f4fdb59660bb1127500225a4bfc7e922775800a50d47d
3
+ size 4015474004
checkpoint-250/pytorch_model_fsdp_0/__2_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d7ecd6017a2dd3c9a01ba52aa0a1ebd16fa417a706aff8a2e1f38dc9a328eb2
3
+ size 4015474004
checkpoint-250/pytorch_model_fsdp_0/__3_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ad4b6d3724edcf396402b673a1697c8728d83b333079df230dd4a1bfd0a5793
3
+ size 4015474004
checkpoint-250/pytorch_model_fsdp_0/__4_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b936fd437858ee02804a3e57092575c794967cf353b441a24c386c644e11a665
3
+ size 4015474004
checkpoint-250/pytorch_model_fsdp_0/__5_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40cd4e340acbce57aa08fc6f8c5dd33f27be417156a7b0041801436ddfdfc246
3
+ size 4015474004
checkpoint-250/pytorch_model_fsdp_0/__6_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f193a686bed3a2589f2963a17e53477f1d9f8c785e80edaeed3a0d08667af2ab
3
+ size 4015474004
checkpoint-250/pytorch_model_fsdp_0/__7_0.distcp ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:403856e7121e448ee4fcd1a5fdbeac5ff6f40d6ac564cab9ee7cedaeac75b229
3
+ size 4015474004
checkpoint-250/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5646d8ccb8e918de34f25b0c51a9c4fb696f5429a47506649c9badd8bf3bcfe1
3
+ size 14960
checkpoint-250/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d94c7abd3510f6bbf1a2ac0a285c77356db76debe7ce90119c9cb896dd03b12d
3
+ size 14960
checkpoint-250/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b31a8737e95e91b14308f4d491da2ab52a884518f361e0d3ab328cb7fad81728
3
+ size 14960