DorinSht committed on
Commit
5d9ef80
1 Parent(s): 1d7c22a

End of training

Browse files
README.md CHANGED
@@ -15,10 +15,10 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # recreate_llama_68M_vanilla
17
 
18
- This model is a fine-tuned version of [JackFram/llama-68m](https://huggingface.co/JackFram/llama-68m) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
- - Loss: 2.3603
21
- - Accuracy: 0.5811
22
 
23
  ## Model description
24
 
 
15
 
16
  # recreate_llama_68M_vanilla
17
 
18
+ This model is a fine-tuned version of [JackFram/llama-68m](https://huggingface.co/JackFram/llama-68m) on the anon8231489123/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json dataset.
19
  It achieves the following results on the evaluation set:
20
+ - Loss: 2.3558
21
+ - Accuracy: 0.5820
22
 
23
  ## Model description
24
 
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_accuracy": 0.45896433805569126,
4
- "eval_loss": 4.308589935302734,
5
- "eval_runtime": 2.8119,
6
- "eval_samples": 10,
7
- "eval_samples_per_second": 3.556,
8
- "eval_steps_per_second": 0.356,
9
- "perplexity": 74.33559700443098,
10
- "total_flos": 1601895923712000.0,
11
- "train_loss": 4.819753979879712,
12
- "train_runtime": 153.0154,
13
- "train_samples": 1000,
14
- "train_samples_per_second": 19.606,
15
- "train_steps_per_second": 1.647
16
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.5819606104373314,
4
+ "eval_loss": 2.3558192253112793,
5
+ "eval_runtime": 128.698,
6
+ "eval_samples": 1840,
7
+ "eval_samples_per_second": 14.297,
8
+ "eval_steps_per_second": 0.303,
9
+ "perplexity": 10.546765500786147,
10
+ "total_flos": 1.4536404559724544e+17,
11
+ "train_loss": 2.5941595100713495,
12
+ "train_runtime": 20556.3593,
13
+ "train_samples": 90745,
14
+ "train_samples_per_second": 13.243,
15
+ "train_steps_per_second": 0.552
16
  }
args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da0612c3c9d86d6249df50bee087bc8118d35203b4807ae34f25b74705525d44
3
  size 6036
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:201594a79150cdbed9448595ad3b468d7c43b4a56fba8ab6f5d555f487e2bab9
3
  size 6036
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
  "epoch": 3.0,
3
- "eval_accuracy": 0.45896433805569126,
4
- "eval_loss": 4.308589935302734,
5
- "eval_runtime": 2.8119,
6
- "eval_samples": 10,
7
- "eval_samples_per_second": 3.556,
8
- "eval_steps_per_second": 0.356,
9
- "perplexity": 74.33559700443098
10
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "eval_accuracy": 0.5819606104373314,
4
+ "eval_loss": 2.3558192253112793,
5
+ "eval_runtime": 128.698,
6
+ "eval_samples": 1840,
7
+ "eval_samples_per_second": 14.297,
8
+ "eval_steps_per_second": 0.303,
9
+ "perplexity": 10.546765500786147
10
  }
events.out.tfevents.1717508814.isl-gpu33.2434801.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59238b0247d9a11cebfcaadb2151c6567d768aafe1085c694aab66aa014c757b
3
+ size 411
log.txt CHANGED
@@ -1018,3 +1018,44 @@ Training completed. Do not forget to share your model on huggingface.co/models =
1018
 
1019
 
1020
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1021
  0%| | 0/39 [00:00<?, ?it/s]
1022
  5%|▌ | 2/39 [00:02<00:54, 1.48s/it]
1023
  8%|▊ | 3/39 [00:05<01:15, 2.10s/it]
1024
  10%|█ | 4/39 [00:08<01:24, 2.43s/it]
1025
  13%|█▎ | 5/39 [00:11<01:28, 2.62s/it]
1026
  15%|█▌ | 6/39 [00:14<01:30, 2.73s/it]
1027
  18%|█▊ | 7/39 [00:17<01:29, 2.81s/it]
1028
  21%|██ | 8/39 [00:20<01:28, 2.86s/it]
1029
  23%|██▎ | 9/39 [00:23<01:26, 2.90s/it]
1030
  26%|██▌ | 10/39 [00:26<01:24, 2.92s/it]
1031
  28%|██▊ | 11/39 [00:29<01:22, 2.93s/it]
1032
  31%|███ | 12/39 [00:32<01:19, 2.94s/it]
1033
  33%|███▎ | 13/39 [00:35<01:16, 2.95s/it]
1034
  36%|███▌ | 14/39 [00:38<01:13, 2.96s/it]
1035
  38%|███▊ | 15/39 [00:41<01:11, 2.96s/it]
1036
  41%|████ | 16/39 [00:44<01:08, 2.96s/it]
1037
  44%|████▎ | 17/39 [00:47<01:05, 2.97s/it]
1038
  46%|████▌ | 18/39 [00:50<01:02, 2.97s/it]
1039
  49%|████▊ | 19/39 [00:53<00:59, 2.97s/it]
1040
  51%|█████▏ | 20/39 [00:56<00:56, 2.97s/it]
1041
  54%|█████▍ | 21/39 [00:59<00:53, 2.97s/it]
1042
  56%|█████▋ | 22/39 [01:02<00:50, 2.97s/it]
1043
  59%|█████▉ | 23/39 [01:05<00:47, 2.97s/it]
1044
  62%|██████▏ | 24/39 [01:08<00:44, 2.97s/it]
1045
  64%|██████▍ | 25/39 [01:11<00:41, 2.97s/it]
1046
  67%|██████▋ | 26/39 [01:14<00:38, 2.97s/it]
1047
  69%|██████▉ | 27/39 [01:17<00:35, 2.97s/it]
1048
  72%|███████▏ | 28/39 [01:20<00:32, 2.97s/it]
1049
  74%|███████▍ | 29/39 [01:23<00:29, 2.97s/it]
1050
  77%|███████▋ | 30/39 [01:26<00:26, 2.97s/it]
1051
  79%|███████▉ | 31/39 [01:29<00:23, 2.97s/it]
1052
  82%|████████▏ | 32/39 [01:32<00:20, 2.97s/it]
1053
  85%|████████▍ | 33/39 [01:35<00:17, 2.97s/it]
1054
  87%|████████▋ | 34/39 [01:38<00:14, 2.97s/it]
1055
  90%|████████▉ | 35/39 [01:41<00:11, 2.97s/it]
1056
  92%|█████████▏| 36/39 [01:43<00:08, 2.97s/it]
1057
  95%|█████████▍| 37/39 [01:46<00:05, 2.97s/it]
1058
  97%|█████████▋| 38/39 [01:49<00:02, 2.94s/it]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1018
 
1019
 
1020
 
1021
+ ***** train metrics *****
1022
+ epoch = 3.0
1023
+ total_flos = 135380817GF
1024
+ train_loss = 2.5942
1025
+ train_runtime = 5:42:36.35
1026
+ train_samples = 90745
1027
+ train_samples_per_second = 13.243
1028
+ train_steps_per_second = 0.552
1029
+ 06/04/2024 06:44:45 - INFO - __main__ - *** Evaluate ***
1030
+ [INFO|trainer.py:3662] 2024-06-04 06:44:45,746 >> ***** Running Evaluation *****
1031
+ [INFO|trainer.py:3664] 2024-06-04 06:44:45,746 >> Num examples = 1840
1032
+ [INFO|trainer.py:3667] 2024-06-04 06:44:45,746 >> Batch size = 48
1033
+ /home/dshteyma/miniconda3/lib/python3.9/site-packages/torch/nn/parallel/_functions.py:68: UserWarning: Was asked to gather along dimension 0, but all input tensors were scalars; will instead unsqueeze and return a vector.
1034
+ warnings.warn('Was asked to gather along dimension 0, but all '
1035
+
1036
  0%| | 0/39 [00:00<?, ?it/s]
1037
  5%|▌ | 2/39 [00:02<00:54, 1.48s/it]
1038
  8%|▊ | 3/39 [00:05<01:15, 2.10s/it]
1039
  10%|█ | 4/39 [00:08<01:24, 2.43s/it]
1040
  13%|█▎ | 5/39 [00:11<01:28, 2.62s/it]
1041
  15%|█▌ | 6/39 [00:14<01:30, 2.73s/it]
1042
  18%|█▊ | 7/39 [00:17<01:29, 2.81s/it]
1043
  21%|██ | 8/39 [00:20<01:28, 2.86s/it]
1044
  23%|██▎ | 9/39 [00:23<01:26, 2.90s/it]
1045
  26%|██▌ | 10/39 [00:26<01:24, 2.92s/it]
1046
  28%|██▊ | 11/39 [00:29<01:22, 2.93s/it]
1047
  31%|███ | 12/39 [00:32<01:19, 2.94s/it]
1048
  33%|███▎ | 13/39 [00:35<01:16, 2.95s/it]
1049
  36%|███▌ | 14/39 [00:38<01:13, 2.96s/it]
1050
  38%|███▊ | 15/39 [00:41<01:11, 2.96s/it]
1051
  41%|████ | 16/39 [00:44<01:08, 2.96s/it]
1052
  44%|████▎ | 17/39 [00:47<01:05, 2.97s/it]
1053
  46%|████▌ | 18/39 [00:50<01:02, 2.97s/it]
1054
  49%|████▊ | 19/39 [00:53<00:59, 2.97s/it]
1055
  51%|█████▏ | 20/39 [00:56<00:56, 2.97s/it]
1056
  54%|█████▍ | 21/39 [00:59<00:53, 2.97s/it]
1057
  56%|█████▋ | 22/39 [01:02<00:50, 2.97s/it]
1058
  59%|█████▉ | 23/39 [01:05<00:47, 2.97s/it]
1059
  62%|██████▏ | 24/39 [01:08<00:44, 2.97s/it]
1060
  64%|██████▍ | 25/39 [01:11<00:41, 2.97s/it]
1061
  67%|██████▋ | 26/39 [01:14<00:38, 2.97s/it]
1062
  69%|██████▉ | 27/39 [01:17<00:35, 2.97s/it]
1063
  72%|███████▏ | 28/39 [01:20<00:32, 2.97s/it]
1064
  74%|███████▍ | 29/39 [01:23<00:29, 2.97s/it]
1065
  77%|███████▋ | 30/39 [01:26<00:26, 2.97s/it]
1066
  79%|███████▉ | 31/39 [01:29<00:23, 2.97s/it]
1067
  82%|████████▏ | 32/39 [01:32<00:20, 2.97s/it]
1068
  85%|████████▍ | 33/39 [01:35<00:17, 2.97s/it]
1069
  87%|████████▋ | 34/39 [01:38<00:14, 2.97s/it]
1070
  90%|████████▉ | 35/39 [01:41<00:11, 2.97s/it]
1071
  92%|█████████▏| 36/39 [01:43<00:08, 2.97s/it]
1072
  95%|█████████▍| 37/39 [01:46<00:05, 2.97s/it]
1073
  97%|█████████▋| 38/39 [01:49<00:02, 2.94s/it]
1074
+ [INFO|trainer.py:3353] 2024-06-04 06:46:54,461 >> Saving model checkpoint to ./training_outputs_job_116987_1_04-06_01-01
1075
+ [INFO|configuration_utils.py:471] 2024-06-04 06:46:54,473 >> Configuration saved in ./training_outputs_job_116987_1_04-06_01-01/config.json
1076
+ [INFO|configuration_utils.py:705] 2024-06-04 06:46:54,478 >> Configuration saved in ./training_outputs_job_116987_1_04-06_01-01/generation_config.json
1077
+ [INFO|modeling_utils.py:2592] 2024-06-04 06:46:55,425 >> Model weights saved in ./training_outputs_job_116987_1_04-06_01-01/model.safetensors
1078
+ [INFO|tokenization_utils_base.py:2503] 2024-06-04 06:46:55,436 >> tokenizer config file saved in ./training_outputs_job_116987_1_04-06_01-01/tokenizer_config.json
1079
+ [INFO|tokenization_utils_base.py:2512] 2024-06-04 06:46:55,440 >> Special tokens file saved in ./training_outputs_job_116987_1_04-06_01-01/special_tokens_map.json
1080
+ [INFO|modelcard.py:450] 2024-06-04 06:46:55,614 >> Dropping the following result as it does not have all the necessary fields:
1081
+ {'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.5819606104373314}]}
1082
+ ***** eval metrics *****
1083
+ epoch = 3.0
1084
+ eval_accuracy = 0.582
1085
+ eval_loss = 2.3558
1086
+ eval_runtime = 0:02:08.69
1087
+ eval_samples = 1840
1088
+ eval_samples_per_second = 14.297
1089
+ eval_steps_per_second = 0.303
1090
+ perplexity = 10.5468
1091
+
1092
+
1093
+
1094
+
1095
+
1096
+
1097
+
1098
+
1099
+
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 3.0,
3
- "total_flos": 1601895923712000.0,
4
- "train_loss": 4.819753979879712,
5
- "train_runtime": 153.0154,
6
- "train_samples": 1000,
7
- "train_samples_per_second": 19.606,
8
- "train_steps_per_second": 1.647
9
  }
 
1
  {
2
  "epoch": 3.0,
3
+ "total_flos": 1.4536404559724544e+17,
4
+ "train_loss": 2.5941595100713495,
5
+ "train_runtime": 20556.3593,
6
+ "train_samples": 90745,
7
+ "train_samples_per_second": 13.243,
8
+ "train_steps_per_second": 0.552
9
  }
trainer_state.json CHANGED
@@ -3,40 +3,293 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 3.0,
5
  "eval_steps": 1000,
6
- "global_step": 252,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  {
12
  "epoch": 3.0,
13
- "step": 252,
14
- "total_flos": 1601895923712000.0,
15
- "train_loss": 4.819753979879712,
16
- "train_runtime": 153.0154,
17
- "train_samples_per_second": 19.606,
18
- "train_steps_per_second": 1.647
19
  }
20
  ],
21
  "logging_steps": 500,
22
- "max_steps": 252,
23
  "num_input_tokens_seen": 0,
24
  "num_train_epochs": 3,
25
- "save_steps": 10000,
26
  "stateful_callbacks": {
27
  "TrainerControl": {
28
  "args": {
29
  "should_epoch_stop": false,
30
  "should_evaluate": false,
31
  "should_log": false,
32
- "should_save": false,
33
  "should_training_stop": false
34
  },
35
  "attributes": {}
36
  }
37
  },
38
- "total_flos": 1601895923712000.0,
39
- "train_batch_size": 12,
40
  "trial_name": null,
41
  "trial_params": null
42
  }
 
3
  "best_model_checkpoint": null,
4
  "epoch": 3.0,
5
  "eval_steps": 1000,
6
+ "global_step": 11346,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
+ {
12
+ "epoch": 0.13220518244315177,
13
+ "grad_norm": 0.8546377420425415,
14
+ "learning_rate": 8.816009873931059e-05,
15
+ "loss": 5.1118,
16
+ "step": 500
17
+ },
18
+ {
19
+ "epoch": 0.26441036488630354,
20
+ "grad_norm": 0.8593683838844299,
21
+ "learning_rate": 9.59831475011252e-05,
22
+ "loss": 3.406,
23
+ "step": 1000
24
+ },
25
+ {
26
+ "epoch": 0.26441036488630354,
27
+ "eval_accuracy": 0.5035308829464115,
28
+ "eval_loss": 3.23445987701416,
29
+ "eval_runtime": 128.969,
30
+ "eval_samples_per_second": 14.267,
31
+ "eval_steps_per_second": 0.302,
32
+ "step": 1000
33
+ },
34
+ {
35
+ "epoch": 0.3966155473294553,
36
+ "grad_norm": 0.9617242217063904,
37
+ "learning_rate": 9.134314230431938e-05,
38
+ "loss": 3.0005,
39
+ "step": 1500
40
+ },
41
+ {
42
+ "epoch": 0.5288207297726071,
43
+ "grad_norm": 0.8956136107444763,
44
+ "learning_rate": 8.670313710751356e-05,
45
+ "loss": 2.8119,
46
+ "step": 2000
47
+ },
48
+ {
49
+ "epoch": 0.5288207297726071,
50
+ "eval_accuracy": 0.5364783564495231,
51
+ "eval_loss": 2.821624517440796,
52
+ "eval_runtime": 128.8547,
53
+ "eval_samples_per_second": 14.28,
54
+ "eval_steps_per_second": 0.303,
55
+ "step": 2000
56
+ },
57
+ {
58
+ "epoch": 0.6610259122157589,
59
+ "grad_norm": 1.2592207193374634,
60
+ "learning_rate": 8.206313191070773e-05,
61
+ "loss": 2.6861,
62
+ "step": 2500
63
+ },
64
+ {
65
+ "epoch": 0.7932310946589106,
66
+ "grad_norm": 1.3978203535079956,
67
+ "learning_rate": 7.742312671390191e-05,
68
+ "loss": 2.6076,
69
+ "step": 3000
70
+ },
71
+ {
72
+ "epoch": 0.7932310946589106,
73
+ "eval_accuracy": 0.5501396529385527,
74
+ "eval_loss": 2.6552908420562744,
75
+ "eval_runtime": 129.1173,
76
+ "eval_samples_per_second": 14.251,
77
+ "eval_steps_per_second": 0.302,
78
+ "step": 3000
79
+ },
80
+ {
81
+ "epoch": 0.9254362771020624,
82
+ "grad_norm": 1.8927521705627441,
83
+ "learning_rate": 7.278312151709609e-05,
84
+ "loss": 2.5643,
85
+ "step": 3500
86
+ },
87
+ {
88
+ "epoch": 1.0576414595452142,
89
+ "grad_norm": 4.235791206359863,
90
+ "learning_rate": 6.814311632029027e-05,
91
+ "loss": 2.4729,
92
+ "step": 4000
93
+ },
94
+ {
95
+ "epoch": 1.0576414595452142,
96
+ "eval_accuracy": 0.5581473949151462,
97
+ "eval_loss": 2.5761468410491943,
98
+ "eval_runtime": 129.3204,
99
+ "eval_samples_per_second": 14.228,
100
+ "eval_steps_per_second": 0.302,
101
+ "step": 4000
102
+ },
103
+ {
104
+ "epoch": 1.189846641988366,
105
+ "grad_norm": 2.753159523010254,
106
+ "learning_rate": 6.350311112348446e-05,
107
+ "loss": 2.4531,
108
+ "step": 4500
109
+ },
110
+ {
111
+ "epoch": 1.3220518244315178,
112
+ "grad_norm": 3.542167901992798,
113
+ "learning_rate": 5.886310592667864e-05,
114
+ "loss": 2.4323,
115
+ "step": 5000
116
+ },
117
+ {
118
+ "epoch": 1.3220518244315178,
119
+ "eval_accuracy": 0.5616867738578195,
120
+ "eval_loss": 2.536275863647461,
121
+ "eval_runtime": 129.4842,
122
+ "eval_samples_per_second": 14.21,
123
+ "eval_steps_per_second": 0.301,
124
+ "step": 5000
125
+ },
126
+ {
127
+ "epoch": 1.4542570068746694,
128
+ "grad_norm": 3.678074598312378,
129
+ "learning_rate": 5.422310072987282e-05,
130
+ "loss": 2.3981,
131
+ "step": 5500
132
+ },
133
+ {
134
+ "epoch": 1.5864621893178212,
135
+ "grad_norm": 5.024658679962158,
136
+ "learning_rate": 4.9583095533066995e-05,
137
+ "loss": 2.3824,
138
+ "step": 6000
139
+ },
140
+ {
141
+ "epoch": 1.5864621893178212,
142
+ "eval_accuracy": 0.5660391134427901,
143
+ "eval_loss": 2.491274118423462,
144
+ "eval_runtime": 129.1418,
145
+ "eval_samples_per_second": 14.248,
146
+ "eval_steps_per_second": 0.302,
147
+ "step": 6000
148
+ },
149
+ {
150
+ "epoch": 1.718667371760973,
151
+ "grad_norm": 5.106564521789551,
152
+ "learning_rate": 4.4943090336261176e-05,
153
+ "loss": 2.3763,
154
+ "step": 6500
155
+ },
156
+ {
157
+ "epoch": 1.8508725542041247,
158
+ "grad_norm": 11.649778366088867,
159
+ "learning_rate": 4.030308513945535e-05,
160
+ "loss": 2.3719,
161
+ "step": 7000
162
+ },
163
+ {
164
+ "epoch": 1.8508725542041247,
165
+ "eval_accuracy": 0.5685828678235382,
166
+ "eval_loss": 2.466362237930298,
167
+ "eval_runtime": 128.8316,
168
+ "eval_samples_per_second": 14.282,
169
+ "eval_steps_per_second": 0.303,
170
+ "step": 7000
171
+ },
172
+ {
173
+ "epoch": 1.9830777366472767,
174
+ "grad_norm": 5.842829704284668,
175
+ "learning_rate": 3.566307994264953e-05,
176
+ "loss": 2.3411,
177
+ "step": 7500
178
+ },
179
+ {
180
+ "epoch": 2.1152829190904283,
181
+ "grad_norm": 14.795243263244629,
182
+ "learning_rate": 3.1023074745843715e-05,
183
+ "loss": 2.3021,
184
+ "step": 8000
185
+ },
186
+ {
187
+ "epoch": 2.1152829190904283,
188
+ "eval_accuracy": 0.5715702725090801,
189
+ "eval_loss": 2.4403789043426514,
190
+ "eval_runtime": 128.3806,
191
+ "eval_samples_per_second": 14.332,
192
+ "eval_steps_per_second": 0.304,
193
+ "step": 8000
194
+ },
195
+ {
196
+ "epoch": 2.24748810153358,
197
+ "grad_norm": 6.084632873535156,
198
+ "learning_rate": 2.6383069549037897e-05,
199
+ "loss": 2.2897,
200
+ "step": 8500
201
+ },
202
+ {
203
+ "epoch": 2.379693283976732,
204
+ "grad_norm": 17.67453956604004,
205
+ "learning_rate": 2.1743064352232075e-05,
206
+ "loss": 2.2848,
207
+ "step": 9000
208
+ },
209
+ {
210
+ "epoch": 2.379693283976732,
211
+ "eval_accuracy": 0.5754845372868036,
212
+ "eval_loss": 2.407994508743286,
213
+ "eval_runtime": 128.557,
214
+ "eval_samples_per_second": 14.313,
215
+ "eval_steps_per_second": 0.303,
216
+ "step": 9000
217
+ },
218
+ {
219
+ "epoch": 2.5118984664198836,
220
+ "grad_norm": 6.01920223236084,
221
+ "learning_rate": 1.7103059155426253e-05,
222
+ "loss": 2.2611,
223
+ "step": 9500
224
+ },
225
+ {
226
+ "epoch": 2.6441036488630356,
227
+ "grad_norm": 7.932415008544922,
228
+ "learning_rate": 1.2463053958620433e-05,
229
+ "loss": 2.2653,
230
+ "step": 10000
231
+ },
232
+ {
233
+ "epoch": 2.6441036488630356,
234
+ "eval_accuracy": 0.5784841549669718,
235
+ "eval_loss": 2.3834304809570312,
236
+ "eval_runtime": 128.7742,
237
+ "eval_samples_per_second": 14.289,
238
+ "eval_steps_per_second": 0.303,
239
+ "step": 10000
240
+ },
241
+ {
242
+ "epoch": 2.776308831306187,
243
+ "grad_norm": 9.934358596801758,
244
+ "learning_rate": 7.823048761814613e-06,
245
+ "loss": 2.2542,
246
+ "step": 10500
247
+ },
248
+ {
249
+ "epoch": 2.908514013749339,
250
+ "grad_norm": 8.533864974975586,
251
+ "learning_rate": 3.183043565008793e-06,
252
+ "loss": 2.2447,
253
+ "step": 11000
254
+ },
255
+ {
256
+ "epoch": 2.908514013749339,
257
+ "eval_accuracy": 0.5811017183152439,
258
+ "eval_loss": 2.3603451251983643,
259
+ "eval_runtime": 129.0604,
260
+ "eval_samples_per_second": 14.257,
261
+ "eval_steps_per_second": 0.302,
262
+ "step": 11000
263
+ },
264
  {
265
  "epoch": 3.0,
266
+ "step": 11346,
267
+ "total_flos": 1.4536404559724544e+17,
268
+ "train_loss": 2.5941595100713495,
269
+ "train_runtime": 20556.3593,
270
+ "train_samples_per_second": 13.243,
271
+ "train_steps_per_second": 0.552
272
  }
273
  ],
274
  "logging_steps": 500,
275
+ "max_steps": 11346,
276
  "num_input_tokens_seen": 0,
277
  "num_train_epochs": 3,
278
+ "save_steps": 1000,
279
  "stateful_callbacks": {
280
  "TrainerControl": {
281
  "args": {
282
  "should_epoch_stop": false,
283
  "should_evaluate": false,
284
  "should_log": false,
285
+ "should_save": true,
286
  "should_training_stop": false
287
  },
288
  "attributes": {}
289
  }
290
  },
291
+ "total_flos": 1.4536404559724544e+17,
292
+ "train_batch_size": 24,
293
  "trial_name": null,
294
  "trial_params": null
295
  }