Add model
Browse files- README.md +49 -0
- all_results.json +11 -0
- config.json +26 -0
- openvino_config.json +99 -0
- openvino_model.bin +3 -0
- openvino_model.xml +0 -0
- special_tokens_map.json +7 -0
- structured_sparsity.csv +73 -0
- tokenizer.json +0 -0
- tokenizer_config.json +14 -0
- training_args.bin +3 -0
- vocab.txt +0 -0
README.md
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
tags:
|
4 |
+
- generated_from_trainer
|
5 |
+
datasets:
|
6 |
+
- squad
|
7 |
+
model-index:
|
8 |
+
- name: bert-base-uncased-squad-v1-jpqd-ov-int8
|
9 |
+
results: []
|
10 |
+
---
|
11 |
+
|
12 |
+
<!-- This model card has been generated automatically according to the information the Trainer had access to. You
|
13 |
+
should probably proofread and complete it, then remove this comment. -->
|
14 |
+
|
15 |
+
# bert-base-uncased-squad-v1-jpqd-ov-int8
|
16 |
+
|
17 |
+
This model is a fine-tuned version of [bert-base-uncased](https://huggingface.co/bert-base-uncased) on the SQuAD dataset.
|
18 |
+
It was compressed using [NNCF](https://github.com/openvinotoolkit/nncf) with [Optimum
|
19 |
+
Intel](https://github.com/huggingface/optimum-intel#openvino) following the [JPQD question-answering
|
20 |
+
example](https://github.com/huggingface/optimum-intel/tree/main/examples/openvino/question-answering#joint-pruning-quantization-and-distillation-jpqd-for-bert-on-squad10).
|
21 |
+
|
22 |
+
### Training hyperparameters
|
23 |
+
|
24 |
+
The following hyperparameters were used during training:
|
25 |
+
- learning_rate: 3e-05
|
26 |
+
- train_batch_size: 16
|
27 |
+
- eval_batch_size: 128
|
28 |
+
- seed: 42
|
29 |
+
- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
|
30 |
+
- lr_scheduler_type: linear
|
31 |
+
- num_epochs: 8.0
|
32 |
+
- mixed_precision_training: Native AMP
|
33 |
+
|
34 |
+
### Training results
|
35 |
+
|
36 |
+
```
|
37 |
+
***** eval metrics *****
|
38 |
+
epoch = 8.0
|
39 |
+
eval_exact_match = 83.141
|
40 |
+
eval_f1 = 89.5906
|
41 |
+
eval_samples = 10784
|
42 |
+
```
|
43 |
+
|
44 |
+
### Framework versions
|
45 |
+
|
46 |
+
- Transformers 4.26.1
|
47 |
+
- PyTorch 1.13.1+cu117
|
48 |
+
- Datasets 2.8.0
|
49 |
+
- Tokenizers 0.13.2
|
all_results.json
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"epoch": 8.0,
|
3 |
+
"eval_exact_match": 83.14096499526963,
|
4 |
+
"eval_f1": 89.59061048191492,
|
5 |
+
"eval_samples": 10784,
|
6 |
+
"train_loss": 2.368001897127539,
|
7 |
+
"train_runtime": 49077.3955,
|
8 |
+
"train_samples": 88524,
|
9 |
+
"train_samples_per_second": 14.43,
|
10 |
+
"train_steps_per_second": 0.902
|
11 |
+
}
|
config.json
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "bert-base-uncased",
|
3 |
+
"architectures": [
|
4 |
+
"NNCFNetwork"
|
5 |
+
],
|
6 |
+
"attention_probs_dropout_prob": 0.1,
|
7 |
+
"classifier_dropout": null,
|
8 |
+
"gradient_checkpointing": false,
|
9 |
+
"hidden_act": "gelu",
|
10 |
+
"hidden_dropout_prob": 0.1,
|
11 |
+
"hidden_size": 768,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 3072,
|
14 |
+
"layer_norm_eps": 1e-12,
|
15 |
+
"max_position_embeddings": 512,
|
16 |
+
"model_type": "bert",
|
17 |
+
"num_attention_heads": 12,
|
18 |
+
"num_hidden_layers": 12,
|
19 |
+
"pad_token_id": 0,
|
20 |
+
"position_embedding_type": "absolute",
|
21 |
+
"torch_dtype": "float32",
|
22 |
+
"transformers_version": "4.26.1",
|
23 |
+
"type_vocab_size": 2,
|
24 |
+
"use_cache": true,
|
25 |
+
"vocab_size": 30522
|
26 |
+
}
|
openvino_config.json
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"compression": [
|
3 |
+
{
|
4 |
+
"algorithm": "movement_sparsity",
|
5 |
+
"ignored_scopes": [
|
6 |
+
"{re}.*NNCFEmbedding.*",
|
7 |
+
"{re}.*qa_outputs.*",
|
8 |
+
"{re}.*LayerNorm.*"
|
9 |
+
],
|
10 |
+
"params": {
|
11 |
+
"enable_structured_masking": true,
|
12 |
+
"importance_regularization_factor": 0.02,
|
13 |
+
"warmup_end_epoch": 4,
|
14 |
+
"warmup_start_epoch": 1
|
15 |
+
},
|
16 |
+
"sparse_structure_by_scopes": [
|
17 |
+
{
|
18 |
+
"mode": "block",
|
19 |
+
"sparse_factors": [
|
20 |
+
32,
|
21 |
+
32
|
22 |
+
],
|
23 |
+
"target_scopes": "{re}.*BertAttention.*"
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"axis": 0,
|
27 |
+
"mode": "per_dim",
|
28 |
+
"target_scopes": "{re}.*BertIntermediate.*"
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"axis": 1,
|
32 |
+
"mode": "per_dim",
|
33 |
+
"target_scopes": "{re}.*BertOutput.*"
|
34 |
+
}
|
35 |
+
]
|
36 |
+
},
|
37 |
+
{
|
38 |
+
"algorithm": "quantization",
|
39 |
+
"export_to_onnx_standard_ops": false,
|
40 |
+
"ignored_scopes": [
|
41 |
+
"{re}.*__add___[0-1]",
|
42 |
+
"{re}.*layer_norm_0",
|
43 |
+
"{re}.*matmul_1",
|
44 |
+
"{re}.*__truediv__*"
|
45 |
+
],
|
46 |
+
"initializer": {
|
47 |
+
"batchnorm_adaptation": {
|
48 |
+
"num_bn_adaptation_samples": 200
|
49 |
+
},
|
50 |
+
"range": {
|
51 |
+
"num_init_samples": 32,
|
52 |
+
"params": {
|
53 |
+
"max_percentile": 99.99,
|
54 |
+
"min_percentile": 0.01
|
55 |
+
},
|
56 |
+
"type": "percentile"
|
57 |
+
}
|
58 |
+
},
|
59 |
+
"overflow_fix": "disable",
|
60 |
+
"preset": "mixed",
|
61 |
+
"scope_overrides": {
|
62 |
+
"activations": {
|
63 |
+
"{re}.*matmul_0": {
|
64 |
+
"mode": "symmetric"
|
65 |
+
}
|
66 |
+
}
|
67 |
+
}
|
68 |
+
}
|
69 |
+
],
|
70 |
+
"input_info": [
|
71 |
+
{
|
72 |
+
"keyword": "input_ids",
|
73 |
+
"sample_size": [
|
74 |
+
16,
|
75 |
+
384
|
76 |
+
],
|
77 |
+
"type": "long"
|
78 |
+
},
|
79 |
+
{
|
80 |
+
"keyword": "token_type_ids",
|
81 |
+
"sample_size": [
|
82 |
+
16,
|
83 |
+
384
|
84 |
+
],
|
85 |
+
"type": "long"
|
86 |
+
},
|
87 |
+
{
|
88 |
+
"keyword": "attention_mask",
|
89 |
+
"sample_size": [
|
90 |
+
16,
|
91 |
+
384
|
92 |
+
],
|
93 |
+
"type": "long"
|
94 |
+
}
|
95 |
+
],
|
96 |
+
"optimum_version": "1.6.4",
|
97 |
+
"save_onnx_model": false,
|
98 |
+
"transformers_version": "4.26.1"
|
99 |
+
}
|
openvino_model.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:81f4e1eba20221fd9fcc6bb8820422413f8413123d7418b0a303fc262449ce59
|
3 |
+
size 75477404
|
openvino_model.xml
ADDED
The diff for this file is too large to render.
See raw diff
|
|
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
structured_sparsity.csv
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,group_id,type,torch_module,weight_shape,pruned_weight_shape,bias_shape,pruned_bias_shape,head_or_channel_id_to_keep,module_node_name
|
2 |
+
0,0,MHSA,nncf_module.bert.encoder.layer.0.attention.self.query,"(768, 768)","(512, 768)","(768,)","(512,)","[0, 3, 4, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
3 |
+
1,0,MHSA,nncf_module.bert.encoder.layer.0.attention.self.key,"(768, 768)","(512, 768)","(768,)","(512,)","[0, 3, 4, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
4 |
+
2,0,MHSA,nncf_module.bert.encoder.layer.0.attention.self.value,"(768, 768)","(512, 768)","(768,)","(512,)","[0, 3, 4, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
5 |
+
3,0,MHSA,nncf_module.bert.encoder.layer.0.attention.output.dense,"(768, 768)","(768, 512)","(768,)","(768,)","[0, 3, 4, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
6 |
+
4,1,FF,nncf_module.bert.encoder.layer.0.intermediate.dense,"(3072, 768)","(2066, 768)","(3072,)","(2066,)",[2066 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
7 |
+
5,1,FF,nncf_module.bert.encoder.layer.0.output.dense,"(768, 3072)","(768, 2066)","(768,)","(768,)",[2066 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[0]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
8 |
+
6,2,MHSA,nncf_module.bert.encoder.layer.1.attention.self.query,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
9 |
+
7,2,MHSA,nncf_module.bert.encoder.layer.1.attention.self.key,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
10 |
+
8,2,MHSA,nncf_module.bert.encoder.layer.1.attention.self.value,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
11 |
+
9,2,MHSA,nncf_module.bert.encoder.layer.1.attention.output.dense,"(768, 768)","(768, 448)","(768,)","(768,)","[1, 4, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
12 |
+
10,3,FF,nncf_module.bert.encoder.layer.1.intermediate.dense,"(3072, 768)","(2067, 768)","(3072,)","(2067,)",[2067 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
13 |
+
11,3,FF,nncf_module.bert.encoder.layer.1.output.dense,"(768, 3072)","(768, 2067)","(768,)","(768,)",[2067 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[1]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
14 |
+
12,4,MHSA,nncf_module.bert.encoder.layer.2.attention.self.query,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 2, 3, 5, 6, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
15 |
+
13,4,MHSA,nncf_module.bert.encoder.layer.2.attention.self.key,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 2, 3, 5, 6, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
16 |
+
14,4,MHSA,nncf_module.bert.encoder.layer.2.attention.self.value,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 2, 3, 5, 6, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
17 |
+
15,4,MHSA,nncf_module.bert.encoder.layer.2.attention.output.dense,"(768, 768)","(768, 576)","(768,)","(768,)","[0, 1, 2, 3, 5, 6, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
18 |
+
16,5,FF,nncf_module.bert.encoder.layer.2.intermediate.dense,"(3072, 768)","(2082, 768)","(3072,)","(2082,)",[2082 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
19 |
+
17,5,FF,nncf_module.bert.encoder.layer.2.output.dense,"(768, 3072)","(768, 2082)","(768,)","(768,)",[2082 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[2]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
20 |
+
18,6,MHSA,nncf_module.bert.encoder.layer.3.attention.self.query,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 3, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
21 |
+
19,6,MHSA,nncf_module.bert.encoder.layer.3.attention.self.key,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 3, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
22 |
+
20,6,MHSA,nncf_module.bert.encoder.layer.3.attention.self.value,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 1, 3, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
23 |
+
21,6,MHSA,nncf_module.bert.encoder.layer.3.attention.output.dense,"(768, 768)","(768, 576)","(768,)","(768,)","[0, 1, 3, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
24 |
+
22,7,FF,nncf_module.bert.encoder.layer.3.intermediate.dense,"(3072, 768)","(2136, 768)","(3072,)","(2136,)",[2136 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
25 |
+
23,7,FF,nncf_module.bert.encoder.layer.3.output.dense,"(768, 3072)","(768, 2136)","(768,)","(768,)",[2136 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[3]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
26 |
+
24,8,MHSA,nncf_module.bert.encoder.layer.4.attention.self.query,"(768, 768)","(640, 768)","(768,)","(640,)","[0, 1, 3, 4, 5, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
27 |
+
25,8,MHSA,nncf_module.bert.encoder.layer.4.attention.self.key,"(768, 768)","(640, 768)","(768,)","(640,)","[0, 1, 3, 4, 5, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
28 |
+
26,8,MHSA,nncf_module.bert.encoder.layer.4.attention.self.value,"(768, 768)","(640, 768)","(768,)","(640,)","[0, 1, 3, 4, 5, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
29 |
+
27,8,MHSA,nncf_module.bert.encoder.layer.4.attention.output.dense,"(768, 768)","(768, 640)","(768,)","(768,)","[0, 1, 3, 4, 5, 6, 7, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
30 |
+
28,9,FF,nncf_module.bert.encoder.layer.4.intermediate.dense,"(3072, 768)","(2023, 768)","(3072,)","(2023,)",[2023 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
31 |
+
29,9,FF,nncf_module.bert.encoder.layer.4.output.dense,"(768, 3072)","(768, 2023)","(768,)","(768,)",[2023 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[4]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
32 |
+
30,10,MHSA,nncf_module.bert.encoder.layer.5.attention.self.query,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 3, 4, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
33 |
+
31,10,MHSA,nncf_module.bert.encoder.layer.5.attention.self.key,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 3, 4, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
34 |
+
32,10,MHSA,nncf_module.bert.encoder.layer.5.attention.self.value,"(768, 768)","(576, 768)","(768,)","(576,)","[0, 3, 4, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
35 |
+
33,10,MHSA,nncf_module.bert.encoder.layer.5.attention.output.dense,"(768, 768)","(768, 576)","(768,)","(768,)","[0, 3, 4, 5, 7, 8, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
36 |
+
34,11,FF,nncf_module.bert.encoder.layer.5.intermediate.dense,"(3072, 768)","(2011, 768)","(3072,)","(2011,)",[2011 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
37 |
+
35,11,FF,nncf_module.bert.encoder.layer.5.output.dense,"(768, 3072)","(768, 2011)","(768,)","(768,)",[2011 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[5]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
38 |
+
36,12,MHSA,nncf_module.bert.encoder.layer.6.attention.self.query,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 5, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
39 |
+
37,12,MHSA,nncf_module.bert.encoder.layer.6.attention.self.key,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 5, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
40 |
+
38,12,MHSA,nncf_module.bert.encoder.layer.6.attention.self.value,"(768, 768)","(448, 768)","(768,)","(448,)","[1, 4, 5, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
41 |
+
39,12,MHSA,nncf_module.bert.encoder.layer.6.attention.output.dense,"(768, 768)","(768, 448)","(768,)","(768,)","[1, 4, 5, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
42 |
+
40,13,FF,nncf_module.bert.encoder.layer.6.intermediate.dense,"(3072, 768)","(1871, 768)","(3072,)","(1871,)",[1871 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
43 |
+
41,13,FF,nncf_module.bert.encoder.layer.6.output.dense,"(768, 3072)","(768, 1871)","(768,)","(768,)",[1871 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[6]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
44 |
+
42,14,MHSA,nncf_module.bert.encoder.layer.7.attention.self.query,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 4, 5, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
45 |
+
43,14,MHSA,nncf_module.bert.encoder.layer.7.attention.self.key,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 4, 5, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
46 |
+
44,14,MHSA,nncf_module.bert.encoder.layer.7.attention.self.value,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 4, 5, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
47 |
+
45,14,MHSA,nncf_module.bert.encoder.layer.7.attention.output.dense,"(768, 768)","(768, 448)","(768,)","(768,)","[0, 2, 4, 5, 8, 9, 10]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
48 |
+
46,15,FF,nncf_module.bert.encoder.layer.7.intermediate.dense,"(3072, 768)","(1858, 768)","(3072,)","(1858,)",[1858 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
49 |
+
47,15,FF,nncf_module.bert.encoder.layer.7.output.dense,"(768, 3072)","(768, 1858)","(768,)","(768,)",[1858 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[7]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
50 |
+
48,16,MHSA,nncf_module.bert.encoder.layer.8.attention.self.query,"(768, 768)","(576, 768)","(768,)","(576,)","[1, 2, 3, 5, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
51 |
+
49,16,MHSA,nncf_module.bert.encoder.layer.8.attention.self.key,"(768, 768)","(576, 768)","(768,)","(576,)","[1, 2, 3, 5, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
52 |
+
50,16,MHSA,nncf_module.bert.encoder.layer.8.attention.self.value,"(768, 768)","(576, 768)","(768,)","(576,)","[1, 2, 3, 5, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
53 |
+
51,16,MHSA,nncf_module.bert.encoder.layer.8.attention.output.dense,"(768, 768)","(768, 576)","(768,)","(768,)","[1, 2, 3, 5, 6, 7, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
54 |
+
52,17,FF,nncf_module.bert.encoder.layer.8.intermediate.dense,"(3072, 768)","(1637, 768)","(3072,)","(1637,)",[1637 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
55 |
+
53,17,FF,nncf_module.bert.encoder.layer.8.output.dense,"(768, 3072)","(768, 1637)","(768,)","(768,)",[1637 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[8]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
56 |
+
54,18,MHSA,nncf_module.bert.encoder.layer.9.attention.self.query,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 3, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
57 |
+
55,18,MHSA,nncf_module.bert.encoder.layer.9.attention.self.key,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 3, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
58 |
+
56,18,MHSA,nncf_module.bert.encoder.layer.9.attention.self.value,"(768, 768)","(448, 768)","(768,)","(448,)","[0, 2, 3, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
59 |
+
57,18,MHSA,nncf_module.bert.encoder.layer.9.attention.output.dense,"(768, 768)","(768, 448)","(768,)","(768,)","[0, 2, 3, 6, 8, 9, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
60 |
+
58,19,FF,nncf_module.bert.encoder.layer.9.intermediate.dense,"(3072, 768)","(1257, 768)","(3072,)","(1257,)",[1257 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
61 |
+
59,19,FF,nncf_module.bert.encoder.layer.9.output.dense,"(768, 3072)","(768, 1257)","(768,)","(768,)",[1257 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[9]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
62 |
+
60,20,MHSA,nncf_module.bert.encoder.layer.10.attention.self.query,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 2, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
63 |
+
61,20,MHSA,nncf_module.bert.encoder.layer.10.attention.self.key,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 2, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
64 |
+
62,20,MHSA,nncf_module.bert.encoder.layer.10.attention.self.value,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 2, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
65 |
+
63,20,MHSA,nncf_module.bert.encoder.layer.10.attention.output.dense,"(768, 768)","(768, 384)","(768,)","(768,)","[0, 2, 3, 9, 10, 11]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
66 |
+
64,21,FF,nncf_module.bert.encoder.layer.10.intermediate.dense,"(3072, 768)","(1159, 768)","(3072,)","(1159,)",[1159 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
67 |
+
65,21,FF,nncf_module.bert.encoder.layer.10.output.dense,"(768, 3072)","(768, 1159)","(768,)","(768,)",[1159 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[10]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
68 |
+
66,22,MHSA,nncf_module.bert.encoder.layer.11.attention.self.query,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 1, 2, 4, 6, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[query]/linear_0
|
69 |
+
67,22,MHSA,nncf_module.bert.encoder.layer.11.attention.self.key,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 1, 2, 4, 6, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[key]/linear_0
|
70 |
+
68,22,MHSA,nncf_module.bert.encoder.layer.11.attention.self.value,"(768, 768)","(384, 768)","(768,)","(384,)","[0, 1, 2, 4, 6, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfAttention[self]/NNCFLinear[value]/linear_0
|
71 |
+
69,22,MHSA,nncf_module.bert.encoder.layer.11.attention.output.dense,"(768, 768)","(768, 384)","(768,)","(768,)","[0, 1, 2, 4, 6, 9]",BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertAttention[attention]/BertSelfOutput[output]/NNCFLinear[dense]/linear_0
|
72 |
+
70,23,FF,nncf_module.bert.encoder.layer.11.intermediate.dense,"(3072, 768)","(1017, 768)","(3072,)","(1017,)",[1017 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertIntermediate[intermediate]/NNCFLinear[dense]/linear_0
|
73 |
+
71,23,FF,nncf_module.bert.encoder.layer.11.output.dense,"(768, 3072)","(768, 1017)","(768,)","(768,)",[1017 items],BertForQuestionAnswering/BertModel[bert]/BertEncoder[encoder]/ModuleList[layer]/BertLayer[11]/BertOutput[output]/NNCFLinear[dense]/linear_0
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer_config.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"do_lower_case": true,
|
4 |
+
"mask_token": "[MASK]",
|
5 |
+
"model_max_length": 512,
|
6 |
+
"name_or_path": "bert-base-uncased",
|
7 |
+
"pad_token": "[PAD]",
|
8 |
+
"sep_token": "[SEP]",
|
9 |
+
"special_tokens_map_file": null,
|
10 |
+
"strip_accents": null,
|
11 |
+
"tokenize_chinese_chars": true,
|
12 |
+
"tokenizer_class": "BertTokenizer",
|
13 |
+
"unk_token": "[UNK]"
|
14 |
+
}
|
training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d5552aa8097d9d47b0343cdacd36e5bc50157c344cb9488c095d5eb9a1bc83f5
|
3 |
+
size 3579
|
vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|