timewanderer commited on
Commit
1fd16ed
1 Parent(s): 6b7b360

Training in progress, step 2500

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2224a23210a48d385404f1ee975648ba19f7baf4e99ecbaeb45ec308c4e8ac98
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ac4d617504c7155f36b9ab5a25d6b4709211109b8d952185af212a471f22930
3
  size 268290900
run-3/checkpoint-1000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7dc069a706a506b11ab20901dc7bba81d30aebb240bc31a6eb93d221d4377cad
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10caa747cc9b055b61da3a870005eb9854639faf1659a2954a7623865e788ced
3
  size 268290900
run-3/checkpoint-1000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3489d76896528cdab6dfbb9d57397fdb9e2e3ba0a8a9e7273c16fbbdaffdafb9
3
  size 536643898
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9ccfb4b4789f10457afb99b1aabd35c5ff0b1a3606c647f48c026ef9e8352d6
3
  size 536643898
run-3/checkpoint-1000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:60ef01273aaa599804d51a93c6c0c61874ebd50ab9e135a21aab5f7cd0e6487a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c794bc4c67ef18245dd516031ce405ab557e4d551d225d8dd1e1abc0f2be8e33
3
  size 1064
run-3/checkpoint-1000/trainer_state.json CHANGED
@@ -10,50 +10,50 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "eval_accuracy": 0.5796774193548387,
14
- "eval_loss": 0.20986367762088776,
15
- "eval_runtime": 5.1673,
16
- "eval_samples_per_second": 599.927,
17
- "eval_steps_per_second": 12.579,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
- "grad_norm": 0.5460540056228638,
23
- "learning_rate": 1.4758909853249476e-05,
24
- "loss": 0.332,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
- "eval_accuracy": 0.8129032258064516,
30
- "eval_loss": 0.1050410121679306,
31
- "eval_runtime": 5.3586,
32
- "eval_samples_per_second": 578.505,
33
- "eval_steps_per_second": 12.13,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
- "eval_accuracy": 0.8709677419354839,
39
- "eval_loss": 0.07279336452484131,
40
- "eval_runtime": 5.2066,
41
- "eval_samples_per_second": 595.399,
42
- "eval_steps_per_second": 12.484,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
- "grad_norm": 0.4723931849002838,
48
- "learning_rate": 9.517819706498952e-06,
49
- "loss": 0.1214,
50
  "step": 1000
51
  }
52
  ],
53
  "logging_steps": 500,
54
- "max_steps": 1908,
55
  "num_input_tokens_seen": 0,
56
- "num_train_epochs": 6,
57
  "save_steps": 500,
58
  "stateful_callbacks": {
59
  "TrainerControl": {
@@ -71,8 +71,8 @@
71
  "train_batch_size": 48,
72
  "trial_name": null,
73
  "trial_params": {
74
- "alpha": 0.9907493566825466,
75
- "num_train_epochs": 6,
76
- "temperature": 9
77
  }
78
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "eval_accuracy": 0.5803225806451613,
14
+ "eval_loss": 0.19005867838859558,
15
+ "eval_runtime": 5.674,
16
+ "eval_samples_per_second": 546.354,
17
+ "eval_steps_per_second": 11.456,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
+ "grad_norm": 0.5149380564689636,
23
+ "learning_rate": 1.685534591194969e-05,
24
+ "loss": 0.3072,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
+ "eval_accuracy": 0.817741935483871,
30
+ "eval_loss": 0.0924694836139679,
31
+ "eval_runtime": 6.0294,
32
+ "eval_samples_per_second": 514.15,
33
+ "eval_steps_per_second": 10.781,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
+ "eval_accuracy": 0.8770967741935484,
39
+ "eval_loss": 0.06209348514676094,
40
+ "eval_runtime": 5.7978,
41
+ "eval_samples_per_second": 534.685,
42
+ "eval_steps_per_second": 11.211,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
+ "grad_norm": 0.4281909167766571,
48
+ "learning_rate": 1.371069182389937e-05,
49
+ "loss": 0.1072,
50
  "step": 1000
51
  }
52
  ],
53
  "logging_steps": 500,
54
+ "max_steps": 3180,
55
  "num_input_tokens_seen": 0,
56
+ "num_train_epochs": 10,
57
  "save_steps": 500,
58
  "stateful_callbacks": {
59
  "TrainerControl": {
 
71
  "train_batch_size": 48,
72
  "trial_name": null,
73
  "trial_params": {
74
+ "alpha": 0.7637668053146042,
75
+ "num_train_epochs": 10,
76
+ "temperature": 18
77
  }
78
  }
run-3/checkpoint-1000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a85143060eb3d68597fa3be18a06447136e3eefcb180aa3b82a2a8463f5f692a
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33ceb3fabcbf71cb66944e6d8ec61e55f12751f2e6805da39ec868f61cfabfda
3
  size 5240
run-3/checkpoint-1500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7dfbfcd7ae1710dc78f86029d19911aebcbf6263be42f5e1fad4d67a84af60e3
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e77ed91a39a7cfbd38ba21e4c29b57be04d44cc975ec8f89e4c9d7fa45dbaaf6
3
  size 268290900
run-3/checkpoint-1500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:36c4fbd6b433cb76cfb9c82d6958f6d74f64d6bee7012fa72823d27b3cea8048
3
  size 536643898
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:308eb28881ddbf364fc834b2998ed4a5228666e33d4a58332e3280f3506c9079
3
  size 536643898
run-3/checkpoint-1500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55c8d3ce0734337fc0c187ca5543b4c70ca45d996531f199209b3a0c2a798109
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71453465aad25f4c5a0a948496c64b1f74df850abda497954afe3695c00756ee
3
  size 1064
run-3/checkpoint-1500/trainer_state.json CHANGED
@@ -10,66 +10,66 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "eval_accuracy": 0.5796774193548387,
14
- "eval_loss": 0.20986367762088776,
15
- "eval_runtime": 5.1673,
16
- "eval_samples_per_second": 599.927,
17
- "eval_steps_per_second": 12.579,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
- "grad_norm": 0.5460540056228638,
23
- "learning_rate": 1.4758909853249476e-05,
24
- "loss": 0.332,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
- "eval_accuracy": 0.8129032258064516,
30
- "eval_loss": 0.1050410121679306,
31
- "eval_runtime": 5.3586,
32
- "eval_samples_per_second": 578.505,
33
- "eval_steps_per_second": 12.13,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
- "eval_accuracy": 0.8709677419354839,
39
- "eval_loss": 0.07279336452484131,
40
- "eval_runtime": 5.2066,
41
- "eval_samples_per_second": 595.399,
42
- "eval_steps_per_second": 12.484,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
- "grad_norm": 0.4723931849002838,
48
- "learning_rate": 9.517819706498952e-06,
49
- "loss": 0.1214,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
- "eval_accuracy": 0.892258064516129,
55
- "eval_loss": 0.058892734348773956,
56
- "eval_runtime": 5.2068,
57
- "eval_samples_per_second": 595.377,
58
- "eval_steps_per_second": 12.484,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
- "grad_norm": 0.3437352180480957,
64
- "learning_rate": 4.276729559748428e-06,
65
- "loss": 0.0826,
66
  "step": 1500
67
  }
68
  ],
69
  "logging_steps": 500,
70
- "max_steps": 1908,
71
  "num_input_tokens_seen": 0,
72
- "num_train_epochs": 6,
73
  "save_steps": 500,
74
  "stateful_callbacks": {
75
  "TrainerControl": {
@@ -87,8 +87,8 @@
87
  "train_batch_size": 48,
88
  "trial_name": null,
89
  "trial_params": {
90
- "alpha": 0.9907493566825466,
91
- "num_train_epochs": 6,
92
- "temperature": 9
93
  }
94
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "eval_accuracy": 0.5803225806451613,
14
+ "eval_loss": 0.19005867838859558,
15
+ "eval_runtime": 5.674,
16
+ "eval_samples_per_second": 546.354,
17
+ "eval_steps_per_second": 11.456,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
+ "grad_norm": 0.5149380564689636,
23
+ "learning_rate": 1.685534591194969e-05,
24
+ "loss": 0.3072,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
+ "eval_accuracy": 0.817741935483871,
30
+ "eval_loss": 0.0924694836139679,
31
+ "eval_runtime": 6.0294,
32
+ "eval_samples_per_second": 514.15,
33
+ "eval_steps_per_second": 10.781,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
+ "eval_accuracy": 0.8770967741935484,
39
+ "eval_loss": 0.06209348514676094,
40
+ "eval_runtime": 5.7978,
41
+ "eval_samples_per_second": 534.685,
42
+ "eval_steps_per_second": 11.211,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
+ "grad_norm": 0.4281909167766571,
48
+ "learning_rate": 1.371069182389937e-05,
49
+ "loss": 0.1072,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
+ "eval_accuracy": 0.9009677419354839,
55
+ "eval_loss": 0.048012129962444305,
56
+ "eval_runtime": 5.7482,
57
+ "eval_samples_per_second": 539.302,
58
+ "eval_steps_per_second": 11.308,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
+ "grad_norm": 0.2933551073074341,
64
+ "learning_rate": 1.0566037735849058e-05,
65
+ "loss": 0.0684,
66
  "step": 1500
67
  }
68
  ],
69
  "logging_steps": 500,
70
+ "max_steps": 3180,
71
  "num_input_tokens_seen": 0,
72
+ "num_train_epochs": 10,
73
  "save_steps": 500,
74
  "stateful_callbacks": {
75
  "TrainerControl": {
 
87
  "train_batch_size": 48,
88
  "trial_name": null,
89
  "trial_params": {
90
+ "alpha": 0.7637668053146042,
91
+ "num_train_epochs": 10,
92
+ "temperature": 18
93
  }
94
  }
run-3/checkpoint-1500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a85143060eb3d68597fa3be18a06447136e3eefcb180aa3b82a2a8463f5f692a
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33ceb3fabcbf71cb66944e6d8ec61e55f12751f2e6805da39ec868f61cfabfda
3
  size 5240
run-3/checkpoint-2000/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9dd8891bb979d2df7a3486f993cea9be9bd846e0af2b9f59c482859415fb9ee9
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5826cd7e1aa88532733104f10a3c130d62ddac8febb6ccaf66bcca270f075d63
3
  size 268290900
run-3/checkpoint-2000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:37e31aa19ee84210ac75899eb23c1dbc98684f7a512c37e4e1c35ef870c29ddd
3
  size 536643898
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e16ae3f53f18fc497fdd42f7ddebf504a6808472292d9fa664baa86991b55fb
3
  size 536643898
run-3/checkpoint-2000/trainer_state.json CHANGED
@@ -10,84 +10,84 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "eval_accuracy": 0.6129032258064516,
14
- "eval_loss": 0.22857815027236938,
15
- "eval_runtime": 5.1291,
16
- "eval_samples_per_second": 604.398,
17
- "eval_steps_per_second": 12.673,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
- "grad_norm": 0.5715988874435425,
23
  "learning_rate": 1.685534591194969e-05,
24
- "loss": 0.368,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
- "eval_accuracy": 0.8406451612903226,
30
- "eval_loss": 0.10308429598808289,
31
- "eval_runtime": 5.3537,
32
- "eval_samples_per_second": 579.034,
33
- "eval_steps_per_second": 12.141,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
- "eval_accuracy": 0.8909677419354839,
39
- "eval_loss": 0.06482071429491043,
40
- "eval_runtime": 5.1876,
41
- "eval_samples_per_second": 597.576,
42
- "eval_steps_per_second": 12.53,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
- "grad_norm": 0.4861523509025574,
48
  "learning_rate": 1.371069182389937e-05,
49
- "loss": 0.1205,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
- "eval_accuracy": 0.9032258064516129,
55
- "eval_loss": 0.04816382750868797,
56
- "eval_runtime": 5.1502,
57
- "eval_samples_per_second": 601.919,
58
- "eval_steps_per_second": 12.621,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
- "grad_norm": 0.3236384689807892,
64
  "learning_rate": 1.0566037735849058e-05,
65
- "loss": 0.0727,
66
  "step": 1500
67
  },
68
  {
69
  "epoch": 5.0,
70
- "eval_accuracy": 0.9145161290322581,
71
- "eval_loss": 0.0385914221405983,
72
- "eval_runtime": 5.275,
73
- "eval_samples_per_second": 587.677,
74
- "eval_steps_per_second": 12.322,
75
  "step": 1590
76
  },
77
  {
78
  "epoch": 6.0,
79
- "eval_accuracy": 0.922258064516129,
80
- "eval_loss": 0.03349597379565239,
81
- "eval_runtime": 5.1757,
82
- "eval_samples_per_second": 598.956,
83
- "eval_steps_per_second": 12.559,
84
  "step": 1908
85
  },
86
  {
87
  "epoch": 6.289308176100629,
88
- "grad_norm": 0.27145814895629883,
89
  "learning_rate": 7.421383647798742e-06,
90
- "loss": 0.0555,
91
  "step": 2000
92
  }
93
  ],
@@ -112,8 +112,8 @@
112
  "train_batch_size": 48,
113
  "trial_name": null,
114
  "trial_params": {
115
- "alpha": 0.34170044466363136,
116
  "num_train_epochs": 10,
117
- "temperature": 5
118
  }
119
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "eval_accuracy": 0.5803225806451613,
14
+ "eval_loss": 0.19005867838859558,
15
+ "eval_runtime": 5.674,
16
+ "eval_samples_per_second": 546.354,
17
+ "eval_steps_per_second": 11.456,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
+ "grad_norm": 0.5149380564689636,
23
  "learning_rate": 1.685534591194969e-05,
24
+ "loss": 0.3072,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
+ "eval_accuracy": 0.817741935483871,
30
+ "eval_loss": 0.0924694836139679,
31
+ "eval_runtime": 6.0294,
32
+ "eval_samples_per_second": 514.15,
33
+ "eval_steps_per_second": 10.781,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
+ "eval_accuracy": 0.8770967741935484,
39
+ "eval_loss": 0.06209348514676094,
40
+ "eval_runtime": 5.7978,
41
+ "eval_samples_per_second": 534.685,
42
+ "eval_steps_per_second": 11.211,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
+ "grad_norm": 0.4281909167766571,
48
  "learning_rate": 1.371069182389937e-05,
49
+ "loss": 0.1072,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
+ "eval_accuracy": 0.9009677419354839,
55
+ "eval_loss": 0.048012129962444305,
56
+ "eval_runtime": 5.7482,
57
+ "eval_samples_per_second": 539.302,
58
+ "eval_steps_per_second": 11.308,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
+ "grad_norm": 0.2933551073074341,
64
  "learning_rate": 1.0566037735849058e-05,
65
+ "loss": 0.0684,
66
  "step": 1500
67
  },
68
  {
69
  "epoch": 5.0,
70
+ "eval_accuracy": 0.9093548387096774,
71
+ "eval_loss": 0.03922256454825401,
72
+ "eval_runtime": 5.9235,
73
+ "eval_samples_per_second": 523.338,
74
+ "eval_steps_per_second": 10.973,
75
  "step": 1590
76
  },
77
  {
78
  "epoch": 6.0,
79
+ "eval_accuracy": 0.912258064516129,
80
+ "eval_loss": 0.03422800451517105,
81
+ "eval_runtime": 5.7032,
82
+ "eval_samples_per_second": 543.553,
83
+ "eval_steps_per_second": 11.397,
84
  "step": 1908
85
  },
86
  {
87
  "epoch": 6.289308176100629,
88
+ "grad_norm": 0.25101438164711,
89
  "learning_rate": 7.421383647798742e-06,
90
+ "loss": 0.0534,
91
  "step": 2000
92
  }
93
  ],
 
112
  "train_batch_size": 48,
113
  "trial_name": null,
114
  "trial_params": {
115
+ "alpha": 0.7637668053146042,
116
  "num_train_epochs": 10,
117
+ "temperature": 18
118
  }
119
  }
run-3/checkpoint-2000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13f3c53667f2d0c994b1e0580a4240dd5f1920dd508edcd4cd8ca7ddb067f7f3
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33ceb3fabcbf71cb66944e6d8ec61e55f12751f2e6805da39ec868f61cfabfda
3
  size 5240
run-3/checkpoint-2500/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe21bc5ed4d44215424e54d6d4c27ba6f24b1a5d50938b550a5fdb16e6adb25f
3
  size 268290900
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ac4d617504c7155f36b9ab5a25d6b4709211109b8d952185af212a471f22930
3
  size 268290900
run-3/checkpoint-2500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ebcdba7042cc47acadbf9741605302f4d74356806e9f6dda63f9b20b9e882862
3
  size 536643898
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d41e272b3c1cc2159216a635ef7226db231ca5c57104983a3bff55ace21198bf
3
  size 536643898
run-3/checkpoint-2500/trainer_state.json CHANGED
@@ -10,100 +10,100 @@
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
- "eval_accuracy": 0.6129032258064516,
14
- "eval_loss": 0.22857815027236938,
15
- "eval_runtime": 5.1291,
16
- "eval_samples_per_second": 604.398,
17
- "eval_steps_per_second": 12.673,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
- "grad_norm": 0.5715988874435425,
23
  "learning_rate": 1.685534591194969e-05,
24
- "loss": 0.368,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
- "eval_accuracy": 0.8406451612903226,
30
- "eval_loss": 0.10308429598808289,
31
- "eval_runtime": 5.3537,
32
- "eval_samples_per_second": 579.034,
33
- "eval_steps_per_second": 12.141,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
- "eval_accuracy": 0.8909677419354839,
39
- "eval_loss": 0.06482071429491043,
40
- "eval_runtime": 5.1876,
41
- "eval_samples_per_second": 597.576,
42
- "eval_steps_per_second": 12.53,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
- "grad_norm": 0.4861523509025574,
48
  "learning_rate": 1.371069182389937e-05,
49
- "loss": 0.1205,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
- "eval_accuracy": 0.9032258064516129,
55
- "eval_loss": 0.04816382750868797,
56
- "eval_runtime": 5.1502,
57
- "eval_samples_per_second": 601.919,
58
- "eval_steps_per_second": 12.621,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
- "grad_norm": 0.3236384689807892,
64
  "learning_rate": 1.0566037735849058e-05,
65
- "loss": 0.0727,
66
  "step": 1500
67
  },
68
  {
69
  "epoch": 5.0,
70
- "eval_accuracy": 0.9145161290322581,
71
- "eval_loss": 0.0385914221405983,
72
- "eval_runtime": 5.275,
73
- "eval_samples_per_second": 587.677,
74
- "eval_steps_per_second": 12.322,
75
  "step": 1590
76
  },
77
  {
78
  "epoch": 6.0,
79
- "eval_accuracy": 0.922258064516129,
80
- "eval_loss": 0.03349597379565239,
81
- "eval_runtime": 5.1757,
82
- "eval_samples_per_second": 598.956,
83
- "eval_steps_per_second": 12.559,
84
  "step": 1908
85
  },
86
  {
87
  "epoch": 6.289308176100629,
88
- "grad_norm": 0.27145814895629883,
89
  "learning_rate": 7.421383647798742e-06,
90
- "loss": 0.0555,
91
  "step": 2000
92
  },
93
  {
94
  "epoch": 7.0,
95
- "eval_accuracy": 0.927741935483871,
96
- "eval_loss": 0.030705930665135384,
97
- "eval_runtime": 5.2177,
98
- "eval_samples_per_second": 594.135,
99
- "eval_steps_per_second": 12.458,
100
  "step": 2226
101
  },
102
  {
103
  "epoch": 7.861635220125786,
104
- "grad_norm": 0.2963285446166992,
105
  "learning_rate": 4.276729559748428e-06,
106
- "loss": 0.0477,
107
  "step": 2500
108
  }
109
  ],
@@ -128,8 +128,8 @@
128
  "train_batch_size": 48,
129
  "trial_name": null,
130
  "trial_params": {
131
- "alpha": 0.34170044466363136,
132
  "num_train_epochs": 10,
133
- "temperature": 5
134
  }
135
  }
 
10
  "log_history": [
11
  {
12
  "epoch": 1.0,
13
+ "eval_accuracy": 0.5803225806451613,
14
+ "eval_loss": 0.19005867838859558,
15
+ "eval_runtime": 5.674,
16
+ "eval_samples_per_second": 546.354,
17
+ "eval_steps_per_second": 11.456,
18
  "step": 318
19
  },
20
  {
21
  "epoch": 1.5723270440251573,
22
+ "grad_norm": 0.5149380564689636,
23
  "learning_rate": 1.685534591194969e-05,
24
+ "loss": 0.3072,
25
  "step": 500
26
  },
27
  {
28
  "epoch": 2.0,
29
+ "eval_accuracy": 0.817741935483871,
30
+ "eval_loss": 0.0924694836139679,
31
+ "eval_runtime": 6.0294,
32
+ "eval_samples_per_second": 514.15,
33
+ "eval_steps_per_second": 10.781,
34
  "step": 636
35
  },
36
  {
37
  "epoch": 3.0,
38
+ "eval_accuracy": 0.8770967741935484,
39
+ "eval_loss": 0.06209348514676094,
40
+ "eval_runtime": 5.7978,
41
+ "eval_samples_per_second": 534.685,
42
+ "eval_steps_per_second": 11.211,
43
  "step": 954
44
  },
45
  {
46
  "epoch": 3.1446540880503147,
47
+ "grad_norm": 0.4281909167766571,
48
  "learning_rate": 1.371069182389937e-05,
49
+ "loss": 0.1072,
50
  "step": 1000
51
  },
52
  {
53
  "epoch": 4.0,
54
+ "eval_accuracy": 0.9009677419354839,
55
+ "eval_loss": 0.048012129962444305,
56
+ "eval_runtime": 5.7482,
57
+ "eval_samples_per_second": 539.302,
58
+ "eval_steps_per_second": 11.308,
59
  "step": 1272
60
  },
61
  {
62
  "epoch": 4.716981132075472,
63
+ "grad_norm": 0.2933551073074341,
64
  "learning_rate": 1.0566037735849058e-05,
65
+ "loss": 0.0684,
66
  "step": 1500
67
  },
68
  {
69
  "epoch": 5.0,
70
+ "eval_accuracy": 0.9093548387096774,
71
+ "eval_loss": 0.03922256454825401,
72
+ "eval_runtime": 5.9235,
73
+ "eval_samples_per_second": 523.338,
74
+ "eval_steps_per_second": 10.973,
75
  "step": 1590
76
  },
77
  {
78
  "epoch": 6.0,
79
+ "eval_accuracy": 0.912258064516129,
80
+ "eval_loss": 0.03422800451517105,
81
+ "eval_runtime": 5.7032,
82
+ "eval_samples_per_second": 543.553,
83
+ "eval_steps_per_second": 11.397,
84
  "step": 1908
85
  },
86
  {
87
  "epoch": 6.289308176100629,
88
+ "grad_norm": 0.25101438164711,
89
  "learning_rate": 7.421383647798742e-06,
90
+ "loss": 0.0534,
91
  "step": 2000
92
  },
93
  {
94
  "epoch": 7.0,
95
+ "eval_accuracy": 0.9209677419354839,
96
+ "eval_loss": 0.0315035916864872,
97
+ "eval_runtime": 5.8633,
98
+ "eval_samples_per_second": 528.715,
99
+ "eval_steps_per_second": 11.086,
100
  "step": 2226
101
  },
102
  {
103
  "epoch": 7.861635220125786,
104
+ "grad_norm": 0.26583123207092285,
105
  "learning_rate": 4.276729559748428e-06,
106
+ "loss": 0.0462,
107
  "step": 2500
108
  }
109
  ],
 
128
  "train_batch_size": 48,
129
  "trial_name": null,
130
  "trial_params": {
131
+ "alpha": 0.7637668053146042,
132
  "num_train_epochs": 10,
133
+ "temperature": 18
134
  }
135
  }
run-3/checkpoint-2500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13f3c53667f2d0c994b1e0580a4240dd5f1920dd508edcd4cd8ca7ddb067f7f3
3
  size 5240
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33ceb3fabcbf71cb66944e6d8ec61e55f12751f2e6805da39ec868f61cfabfda
3
  size 5240
runs/Oct11_19-31-02_821d3e23518d/events.out.tfevents.1728678968.821d3e23518d.3094.4 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c2aa85cff79bf33e44bc2f5ad95e9335a79352cc5df863ac98300dba49dfc47
3
- size 13833
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3365839880123b6421cc311dfcd464aa63bad20c4baa74fe0cb1f55cbf19149a
3
+ size 17292