Add files
Browse files- README.md +0 -0
- all_results.json +159 -0
- config.json +50 -0
- pytorch_model.bin +3 -0
- special_tokens_map.json +5 -0
- tokenizer.json +0 -0
- tokenizer_config.json +9 -0
- train_results.json +159 -0
- trainer_state.json +0 -0
- training_args.bin +3 -0
README.md
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
all_results.json
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"MSE": 0.0,
|
3 |
+
"MSE/layer0": 0.0,
|
4 |
+
"MSE/layer1": 0.0,
|
5 |
+
"MSE/layer10": 0.0,
|
6 |
+
"MSE/layer11": 0.0,
|
7 |
+
"MSE/layer12": 0.0,
|
8 |
+
"MSE/layer13": 0.0,
|
9 |
+
"MSE/layer14": 0.0,
|
10 |
+
"MSE/layer15": 0.0,
|
11 |
+
"MSE/layer16": 0.0,
|
12 |
+
"MSE/layer17": 0.0,
|
13 |
+
"MSE/layer18": 0.0,
|
14 |
+
"MSE/layer19": 0.0,
|
15 |
+
"MSE/layer2": 0.0,
|
16 |
+
"MSE/layer20": 0.0,
|
17 |
+
"MSE/layer21": 0.0,
|
18 |
+
"MSE/layer22": 0.0,
|
19 |
+
"MSE/layer23": 0.0,
|
20 |
+
"MSE/layer3": 0.0,
|
21 |
+
"MSE/layer4": 0.0,
|
22 |
+
"MSE/layer5": 0.0,
|
23 |
+
"MSE/layer6": 0.0,
|
24 |
+
"MSE/layer7": 0.0,
|
25 |
+
"MSE/layer8": 0.0,
|
26 |
+
"MSE/layer9": 0.0,
|
27 |
+
"dead_code_fraction": 1.0,
|
28 |
+
"dead_code_fraction/layer0": 1.0,
|
29 |
+
"dead_code_fraction/layer1": 1.0,
|
30 |
+
"dead_code_fraction/layer10": 1.0,
|
31 |
+
"dead_code_fraction/layer11": 1.0,
|
32 |
+
"dead_code_fraction/layer12": 1.0,
|
33 |
+
"dead_code_fraction/layer13": 1.0,
|
34 |
+
"dead_code_fraction/layer14": 1.0,
|
35 |
+
"dead_code_fraction/layer15": 1.0,
|
36 |
+
"dead_code_fraction/layer16": 1.0,
|
37 |
+
"dead_code_fraction/layer17": 1.0,
|
38 |
+
"dead_code_fraction/layer18": 1.0,
|
39 |
+
"dead_code_fraction/layer19": 1.0,
|
40 |
+
"dead_code_fraction/layer2": 1.0,
|
41 |
+
"dead_code_fraction/layer20": 1.0,
|
42 |
+
"dead_code_fraction/layer21": 1.0,
|
43 |
+
"dead_code_fraction/layer22": 1.0,
|
44 |
+
"dead_code_fraction/layer23": 1.0,
|
45 |
+
"dead_code_fraction/layer3": 1.0,
|
46 |
+
"dead_code_fraction/layer4": 1.0,
|
47 |
+
"dead_code_fraction/layer5": 1.0,
|
48 |
+
"dead_code_fraction/layer6": 1.0,
|
49 |
+
"dead_code_fraction/layer7": 1.0,
|
50 |
+
"dead_code_fraction/layer8": 1.0,
|
51 |
+
"dead_code_fraction/layer9": 1.0,
|
52 |
+
"epoch": 6.26,
|
53 |
+
"input_norm": 0.0,
|
54 |
+
"input_norm/layer0": 0.0,
|
55 |
+
"input_norm/layer1": 0.0,
|
56 |
+
"input_norm/layer10": 0.0,
|
57 |
+
"input_norm/layer11": 0.0,
|
58 |
+
"input_norm/layer12": 0.0,
|
59 |
+
"input_norm/layer13": 0.0,
|
60 |
+
"input_norm/layer14": 0.0,
|
61 |
+
"input_norm/layer15": 0.0,
|
62 |
+
"input_norm/layer16": 0.0,
|
63 |
+
"input_norm/layer17": 0.0,
|
64 |
+
"input_norm/layer18": 0.0,
|
65 |
+
"input_norm/layer19": 0.0,
|
66 |
+
"input_norm/layer2": 0.0,
|
67 |
+
"input_norm/layer20": 0.0,
|
68 |
+
"input_norm/layer21": 0.0,
|
69 |
+
"input_norm/layer22": 0.0,
|
70 |
+
"input_norm/layer23": 0.0,
|
71 |
+
"input_norm/layer3": 0.0,
|
72 |
+
"input_norm/layer4": 0.0,
|
73 |
+
"input_norm/layer5": 0.0,
|
74 |
+
"input_norm/layer6": 0.0,
|
75 |
+
"input_norm/layer7": 0.0,
|
76 |
+
"input_norm/layer8": 0.0,
|
77 |
+
"input_norm/layer9": 0.0,
|
78 |
+
"max_norm": 45.539119720458984,
|
79 |
+
"max_norm/layer0": 34.44173049926758,
|
80 |
+
"max_norm/layer1": 36.61558151245117,
|
81 |
+
"max_norm/layer10": 38.54380416870117,
|
82 |
+
"max_norm/layer11": 34.865203857421875,
|
83 |
+
"max_norm/layer12": 40.908504486083984,
|
84 |
+
"max_norm/layer13": 35.78108215332031,
|
85 |
+
"max_norm/layer14": 36.67228317260742,
|
86 |
+
"max_norm/layer15": 45.083438873291016,
|
87 |
+
"max_norm/layer16": 36.927913665771484,
|
88 |
+
"max_norm/layer17": 45.539119720458984,
|
89 |
+
"max_norm/layer18": 39.2352409362793,
|
90 |
+
"max_norm/layer19": 38.779598236083984,
|
91 |
+
"max_norm/layer2": 26.836795806884766,
|
92 |
+
"max_norm/layer20": 38.50577163696289,
|
93 |
+
"max_norm/layer21": 38.87571334838867,
|
94 |
+
"max_norm/layer22": 39.42427062988281,
|
95 |
+
"max_norm/layer23": 37.21847915649414,
|
96 |
+
"max_norm/layer3": 34.34575271606445,
|
97 |
+
"max_norm/layer4": 34.4432258605957,
|
98 |
+
"max_norm/layer5": 44.077754974365234,
|
99 |
+
"max_norm/layer6": 28.6057071685791,
|
100 |
+
"max_norm/layer7": 37.91745376586914,
|
101 |
+
"max_norm/layer8": 36.69032287597656,
|
102 |
+
"max_norm/layer9": 37.08796691894531,
|
103 |
+
"mean_norm": 11.799732064207396,
|
104 |
+
"mean_norm/layer0": 11.755437850952148,
|
105 |
+
"mean_norm/layer1": 11.22901839017868,
|
106 |
+
"mean_norm/layer10": 11.532833635807037,
|
107 |
+
"mean_norm/layer11": 11.962444841861725,
|
108 |
+
"mean_norm/layer12": 12.79077160358429,
|
109 |
+
"mean_norm/layer13": 11.57960969209671,
|
110 |
+
"mean_norm/layer14": 12.059264957904816,
|
111 |
+
"mean_norm/layer15": 12.540440499782562,
|
112 |
+
"mean_norm/layer16": 11.641206741333008,
|
113 |
+
"mean_norm/layer17": 12.231300234794617,
|
114 |
+
"mean_norm/layer18": 11.600049555301666,
|
115 |
+
"mean_norm/layer19": 11.686796128749847,
|
116 |
+
"mean_norm/layer2": 9.256644666194916,
|
117 |
+
"mean_norm/layer20": 11.78922188282013,
|
118 |
+
"mean_norm/layer21": 11.759462356567383,
|
119 |
+
"mean_norm/layer22": 13.063357532024384,
|
120 |
+
"mean_norm/layer23": 13.022553265094757,
|
121 |
+
"mean_norm/layer3": 12.574194192886353,
|
122 |
+
"mean_norm/layer4": 10.863756775856018,
|
123 |
+
"mean_norm/layer5": 14.197384178638458,
|
124 |
+
"mean_norm/layer6": 10.185243308544159,
|
125 |
+
"mean_norm/layer7": 10.893572747707367,
|
126 |
+
"mean_norm/layer8": 11.53871750831604,
|
127 |
+
"mean_norm/layer9": 11.440286993980408,
|
128 |
+
"multicode_k": 8,
|
129 |
+
"output_norm": 0.0,
|
130 |
+
"output_norm/layer0": 0.0,
|
131 |
+
"output_norm/layer1": 0.0,
|
132 |
+
"output_norm/layer10": 0.0,
|
133 |
+
"output_norm/layer11": 0.0,
|
134 |
+
"output_norm/layer12": 0.0,
|
135 |
+
"output_norm/layer13": 0.0,
|
136 |
+
"output_norm/layer14": 0.0,
|
137 |
+
"output_norm/layer15": 0.0,
|
138 |
+
"output_norm/layer16": 0.0,
|
139 |
+
"output_norm/layer17": 0.0,
|
140 |
+
"output_norm/layer18": 0.0,
|
141 |
+
"output_norm/layer19": 0.0,
|
142 |
+
"output_norm/layer2": 0.0,
|
143 |
+
"output_norm/layer20": 0.0,
|
144 |
+
"output_norm/layer21": 0.0,
|
145 |
+
"output_norm/layer22": 0.0,
|
146 |
+
"output_norm/layer23": 0.0,
|
147 |
+
"output_norm/layer3": 0.0,
|
148 |
+
"output_norm/layer4": 0.0,
|
149 |
+
"output_norm/layer5": 0.0,
|
150 |
+
"output_norm/layer6": 0.0,
|
151 |
+
"output_norm/layer7": 0.0,
|
152 |
+
"output_norm/layer8": 0.0,
|
153 |
+
"output_norm/layer9": 0.0,
|
154 |
+
"train_loss": 2.685329116312663,
|
155 |
+
"train_runtime": 43939.9354,
|
156 |
+
"train_samples": 114937,
|
157 |
+
"train_samples_per_second": 16.386,
|
158 |
+
"train_steps_per_second": 0.341
|
159 |
+
}
|
config.json
ADDED
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"GPTNeoXCodebookModel"
|
4 |
+
],
|
5 |
+
"codebook_at": [
|
6 |
+
"attn_preproj"
|
7 |
+
],
|
8 |
+
"codebook_type": "group",
|
9 |
+
"k_codebook": 8,
|
10 |
+
"kmeans_init": false,
|
11 |
+
"kmeans_init_examples": 1000,
|
12 |
+
"kmeans_kwargs": {
|
13 |
+
"batch_size": 24576,
|
14 |
+
"n_init": "auto"
|
15 |
+
},
|
16 |
+
"kmeans_path": "/.cache/cb_volume/huggingface/kmeans_embeddings.pt",
|
17 |
+
"layers_to_snap": [
|
18 |
+
0,
|
19 |
+
1,
|
20 |
+
2,
|
21 |
+
3,
|
22 |
+
4,
|
23 |
+
5,
|
24 |
+
6,
|
25 |
+
7,
|
26 |
+
8,
|
27 |
+
9,
|
28 |
+
10,
|
29 |
+
11,
|
30 |
+
12,
|
31 |
+
13,
|
32 |
+
14,
|
33 |
+
15,
|
34 |
+
16,
|
35 |
+
17,
|
36 |
+
18,
|
37 |
+
19,
|
38 |
+
20,
|
39 |
+
21,
|
40 |
+
22,
|
41 |
+
23
|
42 |
+
],
|
43 |
+
"loss": "aeloss",
|
44 |
+
"model_type": "codebook",
|
45 |
+
"num_codebooks": 16,
|
46 |
+
"num_codes": 10000,
|
47 |
+
"similarity_metric": "inner_product",
|
48 |
+
"torch_dtype": "float32",
|
49 |
+
"transformers_version": "4.27.3"
|
50 |
+
}
|
pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:047349fb750aba188ba5b471a0c8e518b74984623d17524d02475169d29d4a3d
|
3 |
+
size 2705783745
|
special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": "<|endoftext|>",
|
3 |
+
"eos_token": "<|endoftext|>",
|
4 |
+
"unk_token": "<|endoftext|>"
|
5 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"add_prefix_space": false,
|
3 |
+
"bos_token": "<|endoftext|>",
|
4 |
+
"eos_token": "<|endoftext|>",
|
5 |
+
"model_max_length": 1000000000000000019884624838656,
|
6 |
+
"special_tokens_map_file": "/admin/home-hailey/.cache/huggingface/hub/models--EleutherAI--gpt-neox-20b/snapshots/4e49eadb5d14bd22f314ec3f45b69a87b88c7691/special_tokens_map.json",
|
7 |
+
"tokenizer_class": "GPTNeoXTokenizer",
|
8 |
+
"unk_token": "<|endoftext|>"
|
9 |
+
}
|
train_results.json
ADDED
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"MSE": 0.0,
|
3 |
+
"MSE/layer0": 0.0,
|
4 |
+
"MSE/layer1": 0.0,
|
5 |
+
"MSE/layer10": 0.0,
|
6 |
+
"MSE/layer11": 0.0,
|
7 |
+
"MSE/layer12": 0.0,
|
8 |
+
"MSE/layer13": 0.0,
|
9 |
+
"MSE/layer14": 0.0,
|
10 |
+
"MSE/layer15": 0.0,
|
11 |
+
"MSE/layer16": 0.0,
|
12 |
+
"MSE/layer17": 0.0,
|
13 |
+
"MSE/layer18": 0.0,
|
14 |
+
"MSE/layer19": 0.0,
|
15 |
+
"MSE/layer2": 0.0,
|
16 |
+
"MSE/layer20": 0.0,
|
17 |
+
"MSE/layer21": 0.0,
|
18 |
+
"MSE/layer22": 0.0,
|
19 |
+
"MSE/layer23": 0.0,
|
20 |
+
"MSE/layer3": 0.0,
|
21 |
+
"MSE/layer4": 0.0,
|
22 |
+
"MSE/layer5": 0.0,
|
23 |
+
"MSE/layer6": 0.0,
|
24 |
+
"MSE/layer7": 0.0,
|
25 |
+
"MSE/layer8": 0.0,
|
26 |
+
"MSE/layer9": 0.0,
|
27 |
+
"dead_code_fraction": 1.0,
|
28 |
+
"dead_code_fraction/layer0": 1.0,
|
29 |
+
"dead_code_fraction/layer1": 1.0,
|
30 |
+
"dead_code_fraction/layer10": 1.0,
|
31 |
+
"dead_code_fraction/layer11": 1.0,
|
32 |
+
"dead_code_fraction/layer12": 1.0,
|
33 |
+
"dead_code_fraction/layer13": 1.0,
|
34 |
+
"dead_code_fraction/layer14": 1.0,
|
35 |
+
"dead_code_fraction/layer15": 1.0,
|
36 |
+
"dead_code_fraction/layer16": 1.0,
|
37 |
+
"dead_code_fraction/layer17": 1.0,
|
38 |
+
"dead_code_fraction/layer18": 1.0,
|
39 |
+
"dead_code_fraction/layer19": 1.0,
|
40 |
+
"dead_code_fraction/layer2": 1.0,
|
41 |
+
"dead_code_fraction/layer20": 1.0,
|
42 |
+
"dead_code_fraction/layer21": 1.0,
|
43 |
+
"dead_code_fraction/layer22": 1.0,
|
44 |
+
"dead_code_fraction/layer23": 1.0,
|
45 |
+
"dead_code_fraction/layer3": 1.0,
|
46 |
+
"dead_code_fraction/layer4": 1.0,
|
47 |
+
"dead_code_fraction/layer5": 1.0,
|
48 |
+
"dead_code_fraction/layer6": 1.0,
|
49 |
+
"dead_code_fraction/layer7": 1.0,
|
50 |
+
"dead_code_fraction/layer8": 1.0,
|
51 |
+
"dead_code_fraction/layer9": 1.0,
|
52 |
+
"epoch": 6.26,
|
53 |
+
"input_norm": 0.0,
|
54 |
+
"input_norm/layer0": 0.0,
|
55 |
+
"input_norm/layer1": 0.0,
|
56 |
+
"input_norm/layer10": 0.0,
|
57 |
+
"input_norm/layer11": 0.0,
|
58 |
+
"input_norm/layer12": 0.0,
|
59 |
+
"input_norm/layer13": 0.0,
|
60 |
+
"input_norm/layer14": 0.0,
|
61 |
+
"input_norm/layer15": 0.0,
|
62 |
+
"input_norm/layer16": 0.0,
|
63 |
+
"input_norm/layer17": 0.0,
|
64 |
+
"input_norm/layer18": 0.0,
|
65 |
+
"input_norm/layer19": 0.0,
|
66 |
+
"input_norm/layer2": 0.0,
|
67 |
+
"input_norm/layer20": 0.0,
|
68 |
+
"input_norm/layer21": 0.0,
|
69 |
+
"input_norm/layer22": 0.0,
|
70 |
+
"input_norm/layer23": 0.0,
|
71 |
+
"input_norm/layer3": 0.0,
|
72 |
+
"input_norm/layer4": 0.0,
|
73 |
+
"input_norm/layer5": 0.0,
|
74 |
+
"input_norm/layer6": 0.0,
|
75 |
+
"input_norm/layer7": 0.0,
|
76 |
+
"input_norm/layer8": 0.0,
|
77 |
+
"input_norm/layer9": 0.0,
|
78 |
+
"max_norm": 45.539119720458984,
|
79 |
+
"max_norm/layer0": 34.44173049926758,
|
80 |
+
"max_norm/layer1": 36.61558151245117,
|
81 |
+
"max_norm/layer10": 38.54380416870117,
|
82 |
+
"max_norm/layer11": 34.865203857421875,
|
83 |
+
"max_norm/layer12": 40.908504486083984,
|
84 |
+
"max_norm/layer13": 35.78108215332031,
|
85 |
+
"max_norm/layer14": 36.67228317260742,
|
86 |
+
"max_norm/layer15": 45.083438873291016,
|
87 |
+
"max_norm/layer16": 36.927913665771484,
|
88 |
+
"max_norm/layer17": 45.539119720458984,
|
89 |
+
"max_norm/layer18": 39.2352409362793,
|
90 |
+
"max_norm/layer19": 38.779598236083984,
|
91 |
+
"max_norm/layer2": 26.836795806884766,
|
92 |
+
"max_norm/layer20": 38.50577163696289,
|
93 |
+
"max_norm/layer21": 38.87571334838867,
|
94 |
+
"max_norm/layer22": 39.42427062988281,
|
95 |
+
"max_norm/layer23": 37.21847915649414,
|
96 |
+
"max_norm/layer3": 34.34575271606445,
|
97 |
+
"max_norm/layer4": 34.4432258605957,
|
98 |
+
"max_norm/layer5": 44.077754974365234,
|
99 |
+
"max_norm/layer6": 28.6057071685791,
|
100 |
+
"max_norm/layer7": 37.91745376586914,
|
101 |
+
"max_norm/layer8": 36.69032287597656,
|
102 |
+
"max_norm/layer9": 37.08796691894531,
|
103 |
+
"mean_norm": 11.799732064207396,
|
104 |
+
"mean_norm/layer0": 11.755437850952148,
|
105 |
+
"mean_norm/layer1": 11.22901839017868,
|
106 |
+
"mean_norm/layer10": 11.532833635807037,
|
107 |
+
"mean_norm/layer11": 11.962444841861725,
|
108 |
+
"mean_norm/layer12": 12.79077160358429,
|
109 |
+
"mean_norm/layer13": 11.57960969209671,
|
110 |
+
"mean_norm/layer14": 12.059264957904816,
|
111 |
+
"mean_norm/layer15": 12.540440499782562,
|
112 |
+
"mean_norm/layer16": 11.641206741333008,
|
113 |
+
"mean_norm/layer17": 12.231300234794617,
|
114 |
+
"mean_norm/layer18": 11.600049555301666,
|
115 |
+
"mean_norm/layer19": 11.686796128749847,
|
116 |
+
"mean_norm/layer2": 9.256644666194916,
|
117 |
+
"mean_norm/layer20": 11.78922188282013,
|
118 |
+
"mean_norm/layer21": 11.759462356567383,
|
119 |
+
"mean_norm/layer22": 13.063357532024384,
|
120 |
+
"mean_norm/layer23": 13.022553265094757,
|
121 |
+
"mean_norm/layer3": 12.574194192886353,
|
122 |
+
"mean_norm/layer4": 10.863756775856018,
|
123 |
+
"mean_norm/layer5": 14.197384178638458,
|
124 |
+
"mean_norm/layer6": 10.185243308544159,
|
125 |
+
"mean_norm/layer7": 10.893572747707367,
|
126 |
+
"mean_norm/layer8": 11.53871750831604,
|
127 |
+
"mean_norm/layer9": 11.440286993980408,
|
128 |
+
"multicode_k": 8,
|
129 |
+
"output_norm": 0.0,
|
130 |
+
"output_norm/layer0": 0.0,
|
131 |
+
"output_norm/layer1": 0.0,
|
132 |
+
"output_norm/layer10": 0.0,
|
133 |
+
"output_norm/layer11": 0.0,
|
134 |
+
"output_norm/layer12": 0.0,
|
135 |
+
"output_norm/layer13": 0.0,
|
136 |
+
"output_norm/layer14": 0.0,
|
137 |
+
"output_norm/layer15": 0.0,
|
138 |
+
"output_norm/layer16": 0.0,
|
139 |
+
"output_norm/layer17": 0.0,
|
140 |
+
"output_norm/layer18": 0.0,
|
141 |
+
"output_norm/layer19": 0.0,
|
142 |
+
"output_norm/layer2": 0.0,
|
143 |
+
"output_norm/layer20": 0.0,
|
144 |
+
"output_norm/layer21": 0.0,
|
145 |
+
"output_norm/layer22": 0.0,
|
146 |
+
"output_norm/layer23": 0.0,
|
147 |
+
"output_norm/layer3": 0.0,
|
148 |
+
"output_norm/layer4": 0.0,
|
149 |
+
"output_norm/layer5": 0.0,
|
150 |
+
"output_norm/layer6": 0.0,
|
151 |
+
"output_norm/layer7": 0.0,
|
152 |
+
"output_norm/layer8": 0.0,
|
153 |
+
"output_norm/layer9": 0.0,
|
154 |
+
"train_loss": 2.685329116312663,
|
155 |
+
"train_runtime": 43939.9354,
|
156 |
+
"train_samples": 114937,
|
157 |
+
"train_samples_per_second": 16.386,
|
158 |
+
"train_steps_per_second": 0.341
|
159 |
+
}
|
trainer_state.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:974e67b66201b847274c72f8bccd37bc28a91bd779b977a46504b96111e57b61
|
3 |
+
size 3771
|