JorgeDeC commited on
Commit
b81918b
1 Parent(s): 9e4ef1d

Upload folder using huggingface_hub

Browse files
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6adbd6caf7bdf1842e91bc5816ccdd5448347927512e466c6ffec9628b45e389
3
  size 83946192
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9321de038fa477f451b99b972c97bcca817d50a26abcb6a3ec792884fc053134
3
  size 83946192
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf81dd98ec9c616d55d1232f7ada73dc21309fa8c33dc3f10a7dbdc17291a944
3
  size 168150290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c06c56f6710224e06886df2be739ebd17dffc7734c4e7d8b98fe55f2f0b0b0b
3
  size 168150290
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ef525735c3cf9ca20902b64f335b1cb98298205e4df0c6e14b2c5e5e1d7d8dd
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9413f09c97ddf79cd5b9d44689ea00172f41705aa2b8ecd3992cf19e3e41da70
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6ff283720f8f76393837f434b89763e59f3450d2b659f4e3ad09ffbdff910d9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c808ddfdceb87a96529f08bc0d86714d5079d0321d17fd879e7f27bad8778244
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.5048630187838741,
5
  "eval_steps": 500,
6
- "global_step": 3400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -4774,6 +4774,846 @@
4774
  "learning_rate": 0.00011569464760810825,
4775
  "loss": 0.9558,
4776
  "step": 3400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4777
  }
4778
  ],
4779
  "logging_steps": 5,
@@ -4781,7 +5621,7 @@
4781
  "num_input_tokens_seen": 0,
4782
  "num_train_epochs": 1,
4783
  "save_steps": 100,
4784
- "total_flos": 4.781280773300814e+18,
4785
  "train_batch_size": 2,
4786
  "trial_name": null,
4787
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5939564926869106,
5
  "eval_steps": 500,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
4774
  "learning_rate": 0.00011569464760810825,
4775
  "loss": 0.9558,
4776
  "step": 3400
4777
+ },
4778
+ {
4779
+ "epoch": 0.51,
4780
+ "grad_norm": 0.345703125,
4781
+ "learning_rate": 0.00011543860018295966,
4782
+ "loss": 0.9002,
4783
+ "step": 3405
4784
+ },
4785
+ {
4786
+ "epoch": 0.51,
4787
+ "grad_norm": 0.34375,
4788
+ "learning_rate": 0.00011518244902833537,
4789
+ "loss": 0.9107,
4790
+ "step": 3410
4791
+ },
4792
+ {
4793
+ "epoch": 0.51,
4794
+ "grad_norm": 0.34375,
4795
+ "learning_rate": 0.00011492619586527385,
4796
+ "loss": 0.9712,
4797
+ "step": 3415
4798
+ },
4799
+ {
4800
+ "epoch": 0.51,
4801
+ "grad_norm": 0.357421875,
4802
+ "learning_rate": 0.0001146698424154989,
4803
+ "loss": 0.9171,
4804
+ "step": 3420
4805
+ },
4806
+ {
4807
+ "epoch": 0.51,
4808
+ "grad_norm": 0.33984375,
4809
+ "learning_rate": 0.00011441339040140824,
4810
+ "loss": 0.9107,
4811
+ "step": 3425
4812
+ },
4813
+ {
4814
+ "epoch": 0.51,
4815
+ "grad_norm": 0.337890625,
4816
+ "learning_rate": 0.00011415684154606177,
4817
+ "loss": 0.9396,
4818
+ "step": 3430
4819
+ },
4820
+ {
4821
+ "epoch": 0.51,
4822
+ "grad_norm": 0.341796875,
4823
+ "learning_rate": 0.00011390019757317003,
4824
+ "loss": 0.9101,
4825
+ "step": 3435
4826
+ },
4827
+ {
4828
+ "epoch": 0.51,
4829
+ "grad_norm": 0.345703125,
4830
+ "learning_rate": 0.00011364346020708266,
4831
+ "loss": 0.9324,
4832
+ "step": 3440
4833
+ },
4834
+ {
4835
+ "epoch": 0.51,
4836
+ "grad_norm": 0.341796875,
4837
+ "learning_rate": 0.00011338663117277686,
4838
+ "loss": 0.933,
4839
+ "step": 3445
4840
+ },
4841
+ {
4842
+ "epoch": 0.51,
4843
+ "grad_norm": 0.34765625,
4844
+ "learning_rate": 0.00011312971219584563,
4845
+ "loss": 0.9153,
4846
+ "step": 3450
4847
+ },
4848
+ {
4849
+ "epoch": 0.51,
4850
+ "grad_norm": 0.345703125,
4851
+ "learning_rate": 0.00011287270500248631,
4852
+ "loss": 0.9401,
4853
+ "step": 3455
4854
+ },
4855
+ {
4856
+ "epoch": 0.51,
4857
+ "grad_norm": 0.353515625,
4858
+ "learning_rate": 0.00011261561131948897,
4859
+ "loss": 0.9264,
4860
+ "step": 3460
4861
+ },
4862
+ {
4863
+ "epoch": 0.51,
4864
+ "grad_norm": 0.34375,
4865
+ "learning_rate": 0.00011235843287422482,
4866
+ "loss": 0.92,
4867
+ "step": 3465
4868
+ },
4869
+ {
4870
+ "epoch": 0.52,
4871
+ "grad_norm": 0.361328125,
4872
+ "learning_rate": 0.00011210117139463452,
4873
+ "loss": 0.9187,
4874
+ "step": 3470
4875
+ },
4876
+ {
4877
+ "epoch": 0.52,
4878
+ "grad_norm": 0.33984375,
4879
+ "learning_rate": 0.00011184382860921663,
4880
+ "loss": 0.949,
4881
+ "step": 3475
4882
+ },
4883
+ {
4884
+ "epoch": 0.52,
4885
+ "grad_norm": 0.341796875,
4886
+ "learning_rate": 0.00011158640624701603,
4887
+ "loss": 0.9201,
4888
+ "step": 3480
4889
+ },
4890
+ {
4891
+ "epoch": 0.52,
4892
+ "grad_norm": 0.33984375,
4893
+ "learning_rate": 0.00011132890603761221,
4894
+ "loss": 0.9096,
4895
+ "step": 3485
4896
+ },
4897
+ {
4898
+ "epoch": 0.52,
4899
+ "grad_norm": 0.353515625,
4900
+ "learning_rate": 0.00011107132971110779,
4901
+ "loss": 0.9248,
4902
+ "step": 3490
4903
+ },
4904
+ {
4905
+ "epoch": 0.52,
4906
+ "grad_norm": 0.349609375,
4907
+ "learning_rate": 0.00011081367899811668,
4908
+ "loss": 0.9189,
4909
+ "step": 3495
4910
+ },
4911
+ {
4912
+ "epoch": 0.52,
4913
+ "grad_norm": 0.33984375,
4914
+ "learning_rate": 0.00011055595562975267,
4915
+ "loss": 0.9182,
4916
+ "step": 3500
4917
+ },
4918
+ {
4919
+ "epoch": 0.52,
4920
+ "grad_norm": 0.365234375,
4921
+ "learning_rate": 0.00011029816133761772,
4922
+ "loss": 0.9549,
4923
+ "step": 3505
4924
+ },
4925
+ {
4926
+ "epoch": 0.52,
4927
+ "grad_norm": 0.341796875,
4928
+ "learning_rate": 0.00011004029785379024,
4929
+ "loss": 0.9158,
4930
+ "step": 3510
4931
+ },
4932
+ {
4933
+ "epoch": 0.52,
4934
+ "grad_norm": 0.33203125,
4935
+ "learning_rate": 0.00010978236691081365,
4936
+ "loss": 0.9104,
4937
+ "step": 3515
4938
+ },
4939
+ {
4940
+ "epoch": 0.52,
4941
+ "grad_norm": 0.3515625,
4942
+ "learning_rate": 0.00010952437024168444,
4943
+ "loss": 0.9305,
4944
+ "step": 3520
4945
+ },
4946
+ {
4947
+ "epoch": 0.52,
4948
+ "grad_norm": 0.349609375,
4949
+ "learning_rate": 0.00010926630957984087,
4950
+ "loss": 0.9383,
4951
+ "step": 3525
4952
+ },
4953
+ {
4954
+ "epoch": 0.52,
4955
+ "grad_norm": 0.34765625,
4956
+ "learning_rate": 0.00010900818665915109,
4957
+ "loss": 0.9124,
4958
+ "step": 3530
4959
+ },
4960
+ {
4961
+ "epoch": 0.52,
4962
+ "grad_norm": 0.341796875,
4963
+ "learning_rate": 0.00010875000321390154,
4964
+ "loss": 0.9169,
4965
+ "step": 3535
4966
+ },
4967
+ {
4968
+ "epoch": 0.53,
4969
+ "grad_norm": 0.34375,
4970
+ "learning_rate": 0.00010849176097878535,
4971
+ "loss": 0.9044,
4972
+ "step": 3540
4973
+ },
4974
+ {
4975
+ "epoch": 0.53,
4976
+ "grad_norm": 0.359375,
4977
+ "learning_rate": 0.00010823346168889062,
4978
+ "loss": 0.9234,
4979
+ "step": 3545
4980
+ },
4981
+ {
4982
+ "epoch": 0.53,
4983
+ "grad_norm": 0.34765625,
4984
+ "learning_rate": 0.00010797510707968878,
4985
+ "loss": 0.9431,
4986
+ "step": 3550
4987
+ },
4988
+ {
4989
+ "epoch": 0.53,
4990
+ "grad_norm": 0.345703125,
4991
+ "learning_rate": 0.00010771669888702303,
4992
+ "loss": 0.9263,
4993
+ "step": 3555
4994
+ },
4995
+ {
4996
+ "epoch": 0.53,
4997
+ "grad_norm": 0.34765625,
4998
+ "learning_rate": 0.00010745823884709647,
4999
+ "loss": 0.9423,
5000
+ "step": 3560
5001
+ },
5002
+ {
5003
+ "epoch": 0.53,
5004
+ "grad_norm": 0.3515625,
5005
+ "learning_rate": 0.00010719972869646062,
5006
+ "loss": 0.9232,
5007
+ "step": 3565
5008
+ },
5009
+ {
5010
+ "epoch": 0.53,
5011
+ "grad_norm": 0.35546875,
5012
+ "learning_rate": 0.00010694117017200372,
5013
+ "loss": 0.962,
5014
+ "step": 3570
5015
+ },
5016
+ {
5017
+ "epoch": 0.53,
5018
+ "grad_norm": 0.33203125,
5019
+ "learning_rate": 0.00010668256501093892,
5020
+ "loss": 0.935,
5021
+ "step": 3575
5022
+ },
5023
+ {
5024
+ "epoch": 0.53,
5025
+ "grad_norm": 0.35546875,
5026
+ "learning_rate": 0.00010642391495079278,
5027
+ "loss": 0.9212,
5028
+ "step": 3580
5029
+ },
5030
+ {
5031
+ "epoch": 0.53,
5032
+ "grad_norm": 0.353515625,
5033
+ "learning_rate": 0.00010616522172939356,
5034
+ "loss": 0.9269,
5035
+ "step": 3585
5036
+ },
5037
+ {
5038
+ "epoch": 0.53,
5039
+ "grad_norm": 0.349609375,
5040
+ "learning_rate": 0.00010590648708485946,
5041
+ "loss": 0.9182,
5042
+ "step": 3590
5043
+ },
5044
+ {
5045
+ "epoch": 0.53,
5046
+ "grad_norm": 0.345703125,
5047
+ "learning_rate": 0.000105647712755587,
5048
+ "loss": 0.9437,
5049
+ "step": 3595
5050
+ },
5051
+ {
5052
+ "epoch": 0.53,
5053
+ "grad_norm": 0.353515625,
5054
+ "learning_rate": 0.00010538890048023937,
5055
+ "loss": 0.9449,
5056
+ "step": 3600
5057
+ },
5058
+ {
5059
+ "epoch": 0.54,
5060
+ "grad_norm": 0.353515625,
5061
+ "learning_rate": 0.0001051300519977347,
5062
+ "loss": 0.9023,
5063
+ "step": 3605
5064
+ },
5065
+ {
5066
+ "epoch": 0.54,
5067
+ "grad_norm": 0.349609375,
5068
+ "learning_rate": 0.00010487116904723433,
5069
+ "loss": 0.9136,
5070
+ "step": 3610
5071
+ },
5072
+ {
5073
+ "epoch": 0.54,
5074
+ "grad_norm": 0.33984375,
5075
+ "learning_rate": 0.00010461225336813128,
5076
+ "loss": 0.9317,
5077
+ "step": 3615
5078
+ },
5079
+ {
5080
+ "epoch": 0.54,
5081
+ "grad_norm": 0.337890625,
5082
+ "learning_rate": 0.00010435330670003842,
5083
+ "loss": 0.8979,
5084
+ "step": 3620
5085
+ },
5086
+ {
5087
+ "epoch": 0.54,
5088
+ "grad_norm": 0.34375,
5089
+ "learning_rate": 0.00010409433078277684,
5090
+ "loss": 0.9319,
5091
+ "step": 3625
5092
+ },
5093
+ {
5094
+ "epoch": 0.54,
5095
+ "grad_norm": 0.34375,
5096
+ "learning_rate": 0.00010383532735636411,
5097
+ "loss": 0.9344,
5098
+ "step": 3630
5099
+ },
5100
+ {
5101
+ "epoch": 0.54,
5102
+ "grad_norm": 0.34765625,
5103
+ "learning_rate": 0.00010357629816100272,
5104
+ "loss": 0.907,
5105
+ "step": 3635
5106
+ },
5107
+ {
5108
+ "epoch": 0.54,
5109
+ "grad_norm": 0.345703125,
5110
+ "learning_rate": 0.0001033172449370682,
5111
+ "loss": 0.9222,
5112
+ "step": 3640
5113
+ },
5114
+ {
5115
+ "epoch": 0.54,
5116
+ "grad_norm": 0.345703125,
5117
+ "learning_rate": 0.00010305816942509761,
5118
+ "loss": 0.9384,
5119
+ "step": 3645
5120
+ },
5121
+ {
5122
+ "epoch": 0.54,
5123
+ "grad_norm": 0.353515625,
5124
+ "learning_rate": 0.00010279907336577765,
5125
+ "loss": 0.9195,
5126
+ "step": 3650
5127
+ },
5128
+ {
5129
+ "epoch": 0.54,
5130
+ "grad_norm": 0.34765625,
5131
+ "learning_rate": 0.00010253995849993321,
5132
+ "loss": 0.9177,
5133
+ "step": 3655
5134
+ },
5135
+ {
5136
+ "epoch": 0.54,
5137
+ "grad_norm": 0.353515625,
5138
+ "learning_rate": 0.0001022808265685154,
5139
+ "loss": 0.9058,
5140
+ "step": 3660
5141
+ },
5142
+ {
5143
+ "epoch": 0.54,
5144
+ "grad_norm": 0.353515625,
5145
+ "learning_rate": 0.0001020216793125901,
5146
+ "loss": 0.9293,
5147
+ "step": 3665
5148
+ },
5149
+ {
5150
+ "epoch": 0.54,
5151
+ "grad_norm": 0.34765625,
5152
+ "learning_rate": 0.00010176251847332614,
5153
+ "loss": 0.8824,
5154
+ "step": 3670
5155
+ },
5156
+ {
5157
+ "epoch": 0.55,
5158
+ "grad_norm": 0.33984375,
5159
+ "learning_rate": 0.00010150334579198353,
5160
+ "loss": 0.9316,
5161
+ "step": 3675
5162
+ },
5163
+ {
5164
+ "epoch": 0.55,
5165
+ "grad_norm": 0.3515625,
5166
+ "learning_rate": 0.00010124416300990196,
5167
+ "loss": 0.9351,
5168
+ "step": 3680
5169
+ },
5170
+ {
5171
+ "epoch": 0.55,
5172
+ "grad_norm": 0.349609375,
5173
+ "learning_rate": 0.00010098497186848888,
5174
+ "loss": 0.9187,
5175
+ "step": 3685
5176
+ },
5177
+ {
5178
+ "epoch": 0.55,
5179
+ "grad_norm": 0.349609375,
5180
+ "learning_rate": 0.00010072577410920794,
5181
+ "loss": 0.9019,
5182
+ "step": 3690
5183
+ },
5184
+ {
5185
+ "epoch": 0.55,
5186
+ "grad_norm": 0.361328125,
5187
+ "learning_rate": 0.00010046657147356733,
5188
+ "loss": 0.9152,
5189
+ "step": 3695
5190
+ },
5191
+ {
5192
+ "epoch": 0.55,
5193
+ "grad_norm": 0.34375,
5194
+ "learning_rate": 0.00010020736570310789,
5195
+ "loss": 0.904,
5196
+ "step": 3700
5197
+ },
5198
+ {
5199
+ "epoch": 0.55,
5200
+ "grad_norm": 0.345703125,
5201
+ "learning_rate": 9.99481585393916e-05,
5202
+ "loss": 0.9462,
5203
+ "step": 3705
5204
+ },
5205
+ {
5206
+ "epoch": 0.55,
5207
+ "grad_norm": 0.34765625,
5208
+ "learning_rate": 9.968895172398974e-05,
5209
+ "loss": 0.9428,
5210
+ "step": 3710
5211
+ },
5212
+ {
5213
+ "epoch": 0.55,
5214
+ "grad_norm": 0.34375,
5215
+ "learning_rate": 9.94297469984713e-05,
5216
+ "loss": 0.9453,
5217
+ "step": 3715
5218
+ },
5219
+ {
5220
+ "epoch": 0.55,
5221
+ "grad_norm": 0.34765625,
5222
+ "learning_rate": 9.917054610439124e-05,
5223
+ "loss": 0.9176,
5224
+ "step": 3720
5225
+ },
5226
+ {
5227
+ "epoch": 0.55,
5228
+ "grad_norm": 0.345703125,
5229
+ "learning_rate": 9.89113507832787e-05,
5230
+ "loss": 0.951,
5231
+ "step": 3725
5232
+ },
5233
+ {
5234
+ "epoch": 0.55,
5235
+ "grad_norm": 0.337890625,
5236
+ "learning_rate": 9.865216277662545e-05,
5237
+ "loss": 0.8904,
5238
+ "step": 3730
5239
+ },
5240
+ {
5241
+ "epoch": 0.55,
5242
+ "grad_norm": 0.3515625,
5243
+ "learning_rate": 9.83929838258741e-05,
5244
+ "loss": 0.9197,
5245
+ "step": 3735
5246
+ },
5247
+ {
5248
+ "epoch": 0.56,
5249
+ "grad_norm": 0.341796875,
5250
+ "learning_rate": 9.813381567240639e-05,
5251
+ "loss": 0.9342,
5252
+ "step": 3740
5253
+ },
5254
+ {
5255
+ "epoch": 0.56,
5256
+ "grad_norm": 0.361328125,
5257
+ "learning_rate": 9.787466005753152e-05,
5258
+ "loss": 0.9713,
5259
+ "step": 3745
5260
+ },
5261
+ {
5262
+ "epoch": 0.56,
5263
+ "grad_norm": 0.3359375,
5264
+ "learning_rate": 9.761551872247449e-05,
5265
+ "loss": 0.9556,
5266
+ "step": 3750
5267
+ },
5268
+ {
5269
+ "epoch": 0.56,
5270
+ "grad_norm": 0.35546875,
5271
+ "learning_rate": 9.735639340836428e-05,
5272
+ "loss": 0.9125,
5273
+ "step": 3755
5274
+ },
5275
+ {
5276
+ "epoch": 0.56,
5277
+ "grad_norm": 0.3515625,
5278
+ "learning_rate": 9.709728585622229e-05,
5279
+ "loss": 0.9716,
5280
+ "step": 3760
5281
+ },
5282
+ {
5283
+ "epoch": 0.56,
5284
+ "grad_norm": 0.345703125,
5285
+ "learning_rate": 9.68381978069506e-05,
5286
+ "loss": 0.9105,
5287
+ "step": 3765
5288
+ },
5289
+ {
5290
+ "epoch": 0.56,
5291
+ "grad_norm": 0.341796875,
5292
+ "learning_rate": 9.657913100132011e-05,
5293
+ "loss": 0.8839,
5294
+ "step": 3770
5295
+ },
5296
+ {
5297
+ "epoch": 0.56,
5298
+ "grad_norm": 0.359375,
5299
+ "learning_rate": 9.632008717995916e-05,
5300
+ "loss": 0.9204,
5301
+ "step": 3775
5302
+ },
5303
+ {
5304
+ "epoch": 0.56,
5305
+ "grad_norm": 0.337890625,
5306
+ "learning_rate": 9.606106808334165e-05,
5307
+ "loss": 0.8863,
5308
+ "step": 3780
5309
+ },
5310
+ {
5311
+ "epoch": 0.56,
5312
+ "grad_norm": 0.328125,
5313
+ "learning_rate": 9.580207545177516e-05,
5314
+ "loss": 0.905,
5315
+ "step": 3785
5316
+ },
5317
+ {
5318
+ "epoch": 0.56,
5319
+ "grad_norm": 0.32421875,
5320
+ "learning_rate": 9.554311102538966e-05,
5321
+ "loss": 0.9207,
5322
+ "step": 3790
5323
+ },
5324
+ {
5325
+ "epoch": 0.56,
5326
+ "grad_norm": 0.33984375,
5327
+ "learning_rate": 9.528417654412564e-05,
5328
+ "loss": 0.9598,
5329
+ "step": 3795
5330
+ },
5331
+ {
5332
+ "epoch": 0.56,
5333
+ "grad_norm": 0.359375,
5334
+ "learning_rate": 9.502527374772217e-05,
5335
+ "loss": 0.943,
5336
+ "step": 3800
5337
+ },
5338
+ {
5339
+ "epoch": 0.57,
5340
+ "grad_norm": 0.35546875,
5341
+ "learning_rate": 9.476640437570562e-05,
5342
+ "loss": 0.9201,
5343
+ "step": 3805
5344
+ },
5345
+ {
5346
+ "epoch": 0.57,
5347
+ "grad_norm": 0.34765625,
5348
+ "learning_rate": 9.450757016737776e-05,
5349
+ "loss": 0.9366,
5350
+ "step": 3810
5351
+ },
5352
+ {
5353
+ "epoch": 0.57,
5354
+ "grad_norm": 0.345703125,
5355
+ "learning_rate": 9.424877286180404e-05,
5356
+ "loss": 0.9357,
5357
+ "step": 3815
5358
+ },
5359
+ {
5360
+ "epoch": 0.57,
5361
+ "grad_norm": 0.34765625,
5362
+ "learning_rate": 9.3990014197802e-05,
5363
+ "loss": 0.9431,
5364
+ "step": 3820
5365
+ },
5366
+ {
5367
+ "epoch": 0.57,
5368
+ "grad_norm": 0.34765625,
5369
+ "learning_rate": 9.37312959139296e-05,
5370
+ "loss": 0.9362,
5371
+ "step": 3825
5372
+ },
5373
+ {
5374
+ "epoch": 0.57,
5375
+ "grad_norm": 0.34375,
5376
+ "learning_rate": 9.347261974847341e-05,
5377
+ "loss": 0.9157,
5378
+ "step": 3830
5379
+ },
5380
+ {
5381
+ "epoch": 0.57,
5382
+ "grad_norm": 0.34375,
5383
+ "learning_rate": 9.321398743943706e-05,
5384
+ "loss": 0.9213,
5385
+ "step": 3835
5386
+ },
5387
+ {
5388
+ "epoch": 0.57,
5389
+ "grad_norm": 0.3359375,
5390
+ "learning_rate": 9.295540072452951e-05,
5391
+ "loss": 0.9502,
5392
+ "step": 3840
5393
+ },
5394
+ {
5395
+ "epoch": 0.57,
5396
+ "grad_norm": 0.337890625,
5397
+ "learning_rate": 9.269686134115336e-05,
5398
+ "loss": 0.9224,
5399
+ "step": 3845
5400
+ },
5401
+ {
5402
+ "epoch": 0.57,
5403
+ "grad_norm": 0.33984375,
5404
+ "learning_rate": 9.243837102639328e-05,
5405
+ "loss": 0.8954,
5406
+ "step": 3850
5407
+ },
5408
+ {
5409
+ "epoch": 0.57,
5410
+ "grad_norm": 0.345703125,
5411
+ "learning_rate": 9.217993151700408e-05,
5412
+ "loss": 0.9021,
5413
+ "step": 3855
5414
+ },
5415
+ {
5416
+ "epoch": 0.57,
5417
+ "grad_norm": 0.345703125,
5418
+ "learning_rate": 9.19215445493994e-05,
5419
+ "loss": 0.9367,
5420
+ "step": 3860
5421
+ },
5422
+ {
5423
+ "epoch": 0.57,
5424
+ "grad_norm": 0.337890625,
5425
+ "learning_rate": 9.166321185963984e-05,
5426
+ "loss": 0.9301,
5427
+ "step": 3865
5428
+ },
5429
+ {
5430
+ "epoch": 0.57,
5431
+ "grad_norm": 0.349609375,
5432
+ "learning_rate": 9.140493518342113e-05,
5433
+ "loss": 0.9468,
5434
+ "step": 3870
5435
+ },
5436
+ {
5437
+ "epoch": 0.58,
5438
+ "grad_norm": 0.337890625,
5439
+ "learning_rate": 9.114671625606285e-05,
5440
+ "loss": 0.891,
5441
+ "step": 3875
5442
+ },
5443
+ {
5444
+ "epoch": 0.58,
5445
+ "grad_norm": 0.359375,
5446
+ "learning_rate": 9.088855681249658e-05,
5447
+ "loss": 0.938,
5448
+ "step": 3880
5449
+ },
5450
+ {
5451
+ "epoch": 0.58,
5452
+ "grad_norm": 0.34765625,
5453
+ "learning_rate": 9.063045858725406e-05,
5454
+ "loss": 0.9287,
5455
+ "step": 3885
5456
+ },
5457
+ {
5458
+ "epoch": 0.58,
5459
+ "grad_norm": 0.3359375,
5460
+ "learning_rate": 9.037242331445588e-05,
5461
+ "loss": 0.8992,
5462
+ "step": 3890
5463
+ },
5464
+ {
5465
+ "epoch": 0.58,
5466
+ "grad_norm": 0.333984375,
5467
+ "learning_rate": 9.011445272779962e-05,
5468
+ "loss": 0.9444,
5469
+ "step": 3895
5470
+ },
5471
+ {
5472
+ "epoch": 0.58,
5473
+ "grad_norm": 0.35546875,
5474
+ "learning_rate": 8.985654856054818e-05,
5475
+ "loss": 0.9119,
5476
+ "step": 3900
5477
+ },
5478
+ {
5479
+ "epoch": 0.58,
5480
+ "grad_norm": 0.34375,
5481
+ "learning_rate": 8.95987125455183e-05,
5482
+ "loss": 0.904,
5483
+ "step": 3905
5484
+ },
5485
+ {
5486
+ "epoch": 0.58,
5487
+ "grad_norm": 0.33984375,
5488
+ "learning_rate": 8.934094641506873e-05,
5489
+ "loss": 0.9223,
5490
+ "step": 3910
5491
+ },
5492
+ {
5493
+ "epoch": 0.58,
5494
+ "grad_norm": 0.349609375,
5495
+ "learning_rate": 8.908325190108873e-05,
5496
+ "loss": 0.9288,
5497
+ "step": 3915
5498
+ },
5499
+ {
5500
+ "epoch": 0.58,
5501
+ "grad_norm": 0.349609375,
5502
+ "learning_rate": 8.882563073498635e-05,
5503
+ "loss": 0.9177,
5504
+ "step": 3920
5505
+ },
5506
+ {
5507
+ "epoch": 0.58,
5508
+ "grad_norm": 0.359375,
5509
+ "learning_rate": 8.856808464767689e-05,
5510
+ "loss": 0.888,
5511
+ "step": 3925
5512
+ },
5513
+ {
5514
+ "epoch": 0.58,
5515
+ "grad_norm": 0.345703125,
5516
+ "learning_rate": 8.831061536957107e-05,
5517
+ "loss": 0.9174,
5518
+ "step": 3930
5519
+ },
5520
+ {
5521
+ "epoch": 0.58,
5522
+ "grad_norm": 0.359375,
5523
+ "learning_rate": 8.80532246305637e-05,
5524
+ "loss": 0.9345,
5525
+ "step": 3935
5526
+ },
5527
+ {
5528
+ "epoch": 0.59,
5529
+ "grad_norm": 0.333984375,
5530
+ "learning_rate": 8.779591416002179e-05,
5531
+ "loss": 0.926,
5532
+ "step": 3940
5533
+ },
5534
+ {
5535
+ "epoch": 0.59,
5536
+ "grad_norm": 0.349609375,
5537
+ "learning_rate": 8.753868568677311e-05,
5538
+ "loss": 0.9409,
5539
+ "step": 3945
5540
+ },
5541
+ {
5542
+ "epoch": 0.59,
5543
+ "grad_norm": 0.34375,
5544
+ "learning_rate": 8.728154093909441e-05,
5545
+ "loss": 0.918,
5546
+ "step": 3950
5547
+ },
5548
+ {
5549
+ "epoch": 0.59,
5550
+ "grad_norm": 0.36328125,
5551
+ "learning_rate": 8.702448164470007e-05,
5552
+ "loss": 0.9695,
5553
+ "step": 3955
5554
+ },
5555
+ {
5556
+ "epoch": 0.59,
5557
+ "grad_norm": 0.349609375,
5558
+ "learning_rate": 8.676750953073011e-05,
5559
+ "loss": 0.9221,
5560
+ "step": 3960
5561
+ },
5562
+ {
5563
+ "epoch": 0.59,
5564
+ "grad_norm": 0.341796875,
5565
+ "learning_rate": 8.65106263237389e-05,
5566
+ "loss": 0.9461,
5567
+ "step": 3965
5568
+ },
5569
+ {
5570
+ "epoch": 0.59,
5571
+ "grad_norm": 0.3515625,
5572
+ "learning_rate": 8.625383374968357e-05,
5573
+ "loss": 0.9107,
5574
+ "step": 3970
5575
+ },
5576
+ {
5577
+ "epoch": 0.59,
5578
+ "grad_norm": 0.359375,
5579
+ "learning_rate": 8.599713353391207e-05,
5580
+ "loss": 0.9238,
5581
+ "step": 3975
5582
+ },
5583
+ {
5584
+ "epoch": 0.59,
5585
+ "grad_norm": 0.34765625,
5586
+ "learning_rate": 8.574052740115201e-05,
5587
+ "loss": 0.9063,
5588
+ "step": 3980
5589
+ },
5590
+ {
5591
+ "epoch": 0.59,
5592
+ "grad_norm": 0.34765625,
5593
+ "learning_rate": 8.548401707549878e-05,
5594
+ "loss": 0.9457,
5595
+ "step": 3985
5596
+ },
5597
+ {
5598
+ "epoch": 0.59,
5599
+ "grad_norm": 0.353515625,
5600
+ "learning_rate": 8.522760428040402e-05,
5601
+ "loss": 0.9385,
5602
+ "step": 3990
5603
+ },
5604
+ {
5605
+ "epoch": 0.59,
5606
+ "grad_norm": 0.341796875,
5607
+ "learning_rate": 8.49712907386642e-05,
5608
+ "loss": 0.9101,
5609
+ "step": 3995
5610
+ },
5611
+ {
5612
+ "epoch": 0.59,
5613
+ "grad_norm": 0.34375,
5614
+ "learning_rate": 8.471507817240882e-05,
5615
+ "loss": 0.9426,
5616
+ "step": 4000
5617
  }
5618
  ],
5619
  "logging_steps": 5,
 
5621
  "num_input_tokens_seen": 0,
5622
  "num_train_epochs": 1,
5623
  "save_steps": 100,
5624
+ "total_flos": 5.625036203930681e+18,
5625
  "train_batch_size": 2,
5626
  "trial_name": null,
5627
  "trial_params": null