MarkelFe committed
Commit 9fe198a
1 parent: cd3ca73

Training in progress, step 10000

.gitignore ADDED
@@ -0,0 +1 @@
+ checkpoint-*/
config.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "_name_or_path": "DeepESP/gpt2-spanish",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "gradient_checkpointing": false,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.28.1",
+   "use_cache": true,
+   "vocab_size": 50257
+ }
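This config describes a standard 12-layer, 768-dimensional GPT-2 with a 50257-token vocabulary, fine-tuned from DeepESP/gpt2-spanish under transformers 4.28.1. As a rough sketch (not part of this commit), the checkpoint can be loaded and sampled with the standard transformers API; "./model_repo" below is a placeholder for a local copy of these files or the Hub repo id:

# Minimal sketch: load this fine-tuned GPT-2 Spanish checkpoint and sample from it.
# "./model_repo" is a placeholder path, not a file in this commit.
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("./model_repo")     # tokenizer.json, vocab.json, merges.txt
model = AutoModelForCausalLM.from_pretrained("./model_repo")  # config.json + pytorch_model.bin

inputs = tokenizer("Hola, ¿qué tal?", return_tensors="pt")
outputs = model.generate(**inputs, do_sample=True, max_length=50)  # mirrors task_specific_params
print(tokenizer.decode(outputs[0], skip_special_tokens=True))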
last-checkpoint/config.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "_name_or_path": "DeepESP/gpt2-spanish",
+   "activation_function": "gelu_new",
+   "architectures": [
+     "GPT2LMHeadModel"
+   ],
+   "attn_pdrop": 0.1,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.1,
+   "eos_token_id": 50256,
+   "gradient_checkpointing": false,
+   "initializer_range": 0.02,
+   "layer_norm_epsilon": 1e-05,
+   "model_type": "gpt2",
+   "n_ctx": 1024,
+   "n_embd": 768,
+   "n_head": 12,
+   "n_inner": null,
+   "n_layer": 12,
+   "n_positions": 1024,
+   "reorder_and_upcast_attn": false,
+   "resid_pdrop": 0.1,
+   "scale_attn_by_inverse_layer_idx": false,
+   "scale_attn_weights": true,
+   "summary_activation": null,
+   "summary_first_dropout": 0.1,
+   "summary_proj_to_labels": true,
+   "summary_type": "cls_index",
+   "summary_use_proj": true,
+   "task_specific_params": {
+     "text-generation": {
+       "do_sample": true,
+       "max_length": 50
+     }
+   },
+   "torch_dtype": "float32",
+   "transformers_version": "4.28.1",
+   "use_cache": true,
+   "vocab_size": 50257
+ }
last-checkpoint/generation_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "max_length": 1024,
+   "pad_token_id": 50256,
+   "transformers_version": "4.28.1"
+ }
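generation_config.json only pins generation defaults: max_length 1024 and pad_token_id 50256 (the <|endoftext|> id). A small sketch of reading these defaults, assuming the checkpoint folder is available locally as "last-checkpoint/":

# Sketch: generation_config.json supplies default arguments for model.generate().
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("last-checkpoint")
print(gen_config.max_length, gen_config.pad_token_id)  # 1024 50256
# Per-call arguments still override these defaults, e.g.:
# model.generate(**inputs, generation_config=gen_config, max_new_tokens=50)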
last-checkpoint/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c79d4fa193949a1e8ccb93f801e7c758dda8058edccbb714f82055b7cb87d50a
+ size 995605189
last-checkpoint/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b8918b8af28267ba18d33b060c30bc163ca65c26abd0ea3193b5918ef770c45d
+ size 510398013
last-checkpoint/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c520cf1bb57c2a4c42377e127df9db56f0427574ca822e986a5c3b83bd4c40b1
+ size 14575
last-checkpoint/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a609d7204d71efec22c7a71c8efd13a4bc386c7339091e854dae45c249c77f63
+ size 627
last-checkpoint/special_tokens_map.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "additional_special_tokens": [
+     "<|talk|>",
+     "<|ax1|>",
+     "<|ax2|>",
+     "<|ax3|>",
+     "<|ax4|>",
+     "<|ax5|>",
+     "<|ax6|>",
+     "<|ax7|>",
+     "<|ax8|>",
+     "<|ax9|>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "pad_token": "<|endoftext|>",
+   "unk_token": "<|endoftext|>"
+ }
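The tokenizer carries ten additional special tokens (<|talk|> and <|ax1|> through <|ax9|>) besides the usual <|endoftext|>. An illustrative sketch of inspecting and using them, assuming a local copy of the checkpoint folder; this is not tied to the author's training script:

# Illustrative only: inspect the special tokens shipped with this tokenizer.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("last-checkpoint")
print(tokenizer.additional_special_tokens)           # ['<|talk|>', '<|ax1|>', ..., '<|ax9|>']
print(tokenizer.convert_tokens_to_ids("<|talk|>"))   # each special token maps to a single id

# Because they tokenize as single units, they can delimit segments in a prompt:
ids = tokenizer("<|talk|>Hola", return_tensors="pt").input_ids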
last-checkpoint/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
last-checkpoint/tokenizer_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": true,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "full_tokenizer_file": null,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
last-checkpoint/trainer_state.json ADDED
@@ -0,0 +1,150 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.23676484515579127,
+   "global_step": 10000,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0,
+       "learning_rate": 3.913400890235818e-05,
+       "loss": 3.2516,
+       "step": 1
+     },
+     {
+       "epoch": 0.01,
+       "learning_rate": 3.907099788490072e-05,
+       "loss": 3.2746,
+       "step": 500
+     },
+     {
+       "epoch": 0.02,
+       "learning_rate": 3.900786059285918e-05,
+       "loss": 3.2966,
+       "step": 1000
+     },
+     {
+       "epoch": 0.04,
+       "learning_rate": 3.894472330081763e-05,
+       "loss": 3.2988,
+       "step": 1500
+     },
+     {
+       "epoch": 0.05,
+       "learning_rate": 3.888158600877609e-05,
+       "loss": 3.2782,
+       "step": 2000
+     },
+     {
+       "epoch": 0.06,
+       "learning_rate": 3.8818448716734545e-05,
+       "loss": 3.2631,
+       "step": 2500
+     },
+     {
+       "epoch": 0.07,
+       "learning_rate": 3.8755311424693e-05,
+       "loss": 3.2792,
+       "step": 3000
+     },
+     {
+       "epoch": 0.08,
+       "learning_rate": 3.8692174132651456e-05,
+       "loss": 3.2848,
+       "step": 3500
+     },
+     {
+       "epoch": 0.09,
+       "learning_rate": 3.862903684060991e-05,
+       "loss": 3.2699,
+       "step": 4000
+     },
+     {
+       "epoch": 0.11,
+       "learning_rate": 3.8565899548568366e-05,
+       "loss": 3.2634,
+       "step": 4500
+     },
+     {
+       "epoch": 0.12,
+       "learning_rate": 3.8502762256526824e-05,
+       "loss": 3.276,
+       "step": 5000
+     },
+     {
+       "epoch": 0.13,
+       "learning_rate": 3.8439624964485276e-05,
+       "loss": 3.2431,
+       "step": 5500
+     },
+     {
+       "epoch": 0.14,
+       "learning_rate": 3.8376487672443735e-05,
+       "loss": 3.261,
+       "step": 6000
+     },
+     {
+       "epoch": 0.15,
+       "learning_rate": 3.8313350380402186e-05,
+       "loss": 3.2711,
+       "step": 6500
+     },
+     {
+       "epoch": 0.17,
+       "learning_rate": 3.8250213088360645e-05,
+       "loss": 3.2567,
+       "step": 7000
+     },
+     {
+       "epoch": 0.18,
+       "learning_rate": 3.8187075796319097e-05,
+       "loss": 3.2721,
+       "step": 7500
+     },
+     {
+       "epoch": 0.19,
+       "learning_rate": 3.8123938504277555e-05,
+       "loss": 3.2305,
+       "step": 8000
+     },
+     {
+       "epoch": 0.2,
+       "learning_rate": 3.806080121223601e-05,
+       "loss": 3.2749,
+       "step": 8500
+     },
+     {
+       "epoch": 0.21,
+       "learning_rate": 3.7997663920194465e-05,
+       "loss": 3.245,
+       "step": 9000
+     },
+     {
+       "epoch": 0.22,
+       "learning_rate": 3.7934526628152924e-05,
+       "loss": 3.251,
+       "step": 9500
+     },
+     {
+       "epoch": 0.24,
+       "learning_rate": 3.7871389336111375e-05,
+       "loss": 3.2401,
+       "step": 10000
+     },
+     {
+       "epoch": 0.24,
+       "eval_loss": 3.248208999633789,
+       "eval_runtime": 113.4703,
+       "eval_samples_per_second": 165.435,
+       "eval_steps_per_second": 20.684,
+       "step": 10000
+     }
+   ],
+   "max_steps": 633540,
+   "num_train_epochs": 15,
+   "total_flos": 2276770162176000.0,
+   "trial_name": null,
+   "trial_params": null
+ }
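trainer_state.json records the run so far: 10,000 of 633,540 planned steps (epoch ~0.24 of 15), training loss hovering around 3.24–3.30, and an eval_loss of 3.248 at step 10,000. A minimal sketch of reading this state, assuming the folder is available locally; resuming the run itself would use the TrainingArguments serialized in training_args.bin:

# Sketch: read the recorded progress from trainer_state.json.
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print(f'{state["global_step"]} / {state["max_steps"]} steps')        # 10000 / 633540
print(f'epoch {state["epoch"]:.2f} of {state["num_train_epochs"]}')  # 0.24 of 15
last_eval = [e for e in state["log_history"] if "eval_loss" in e][-1]
print("eval_loss:", last_eval["eval_loss"])                          # ~3.248

# Resuming would typically go through the Trainer API, e.g.
# trainer.train(resume_from_checkpoint="last-checkpoint").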
last-checkpoint/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e13bd02ff5c40117ddbf942838af2297189fae003bd9b651003f2826488ee929
+ size 4987
last-checkpoint/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b8918b8af28267ba18d33b060c30bc163ca65c26abd0ea3193b5918ef770c45d
+ size 510398013
special_tokens_map.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "additional_special_tokens": [
+     "<|talk|>",
+     "<|ax1|>",
+     "<|ax2|>",
+     "<|ax3|>",
+     "<|ax4|>",
+     "<|ax5|>",
+     "<|ax6|>",
+     "<|ax7|>",
+     "<|ax8|>",
+     "<|ax9|>"
+   ],
+   "bos_token": "<|endoftext|>",
+   "eos_token": "<|endoftext|>",
+   "pad_token": "<|endoftext|>",
+   "unk_token": "<|endoftext|>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,41 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "clean_up_tokenization_spaces": true,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "full_tokenizer_file": null,
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e13bd02ff5c40117ddbf942838af2297189fae003bd9b651003f2826488ee929
+ size 4987
vocab.json ADDED
The diff for this file is too large to render. See raw diff