kazzand committed on
Commit
1ce35f8
1 Parent(s): fe86055

wiki checkpoint best

Browse files
config.json CHANGED
@@ -1,4 +1,5 @@
1
  {
 
2
  "architectures": [
3
  "LongformerForMaskedLM"
4
  ],
@@ -25,7 +26,7 @@
25
  "position_embedding_type": "absolute",
26
  "sep_token_id": 2,
27
  "torch_dtype": "float32",
28
- "transformers_version": "4.29.2",
29
  "type_vocab_size": 2,
30
  "vocab_size": 83830
31
  }
 
1
  {
2
+ "_name_or_path": "ru-longformer-tiny-16384-new_texts-20epoch-2",
3
  "architectures": [
4
  "LongformerForMaskedLM"
5
  ],
 
26
  "position_embedding_type": "absolute",
27
  "sep_token_id": 2,
28
  "torch_dtype": "float32",
29
+ "transformers_version": "4.31.0",
30
  "type_vocab_size": 2,
31
  "vocab_size": 83830
32
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1ea56eb49abad0501a5f7fd33f2c6e822f025d65a891f62c7cb9583d7e451c0e
3
  size 138548629
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:62971f83771ea2494667046f749fd57156e0c30ccc2cc333c06c7240b1a36a31
3
  size 138548629
special_tokens_map.json CHANGED
@@ -2,7 +2,13 @@
2
  "bos_token": "<s>",
3
  "cls_token": "[CLS]",
4
  "eos_token": "</s>",
5
- "mask_token": "[MASK]",
 
 
 
 
 
 
6
  "pad_token": "[PAD]",
7
  "sep_token": "[SEP]",
8
  "unk_token": "[UNK]"
 
2
  "bos_token": "<s>",
3
  "cls_token": "[CLS]",
4
  "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "[MASK]",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
  "pad_token": "[PAD]",
13
  "sep_token": "[SEP]",
14
  "unk_token": "[UNK]"
tokenizer.json CHANGED
@@ -2,12 +2,14 @@
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
- "max_length": 512,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
9
  "padding": {
10
- "strategy": "BatchLongest",
 
 
11
  "direction": "Right",
12
  "pad_to_multiple_of": null,
13
  "pad_id": 0,
@@ -55,7 +57,7 @@
55
  "id": 4,
56
  "content": "[MASK]",
57
  "single_word": false,
58
- "lstrip": false,
59
  "rstrip": false,
60
  "normalized": false,
61
  "special": true
 
2
  "version": "1.0",
3
  "truncation": {
4
  "direction": "Right",
5
+ "max_length": 16384,
6
  "strategy": "LongestFirst",
7
  "stride": 0
8
  },
9
  "padding": {
10
+ "strategy": {
11
+ "Fixed": 16384
12
+ },
13
  "direction": "Right",
14
  "pad_to_multiple_of": null,
15
  "pad_id": 0,
 
57
  "id": 4,
58
  "content": "[MASK]",
59
  "single_word": false,
60
+ "lstrip": true,
61
  "rstrip": false,
62
  "normalized": false,
63
  "special": true
tokenizer_config.json CHANGED
@@ -1,8 +1,12 @@
1
  {
 
 
2
  "clean_up_tokenization_spaces": true,
3
  "cls_token": "[CLS]",
4
  "do_basic_tokenize": true,
5
  "do_lower_case": false,
 
 
6
  "mask_token": "[MASK]",
7
  "model_max_length": 16384,
8
  "never_split": null,
@@ -10,6 +14,7 @@
10
  "sep_token": "[SEP]",
11
  "strip_accents": null,
12
  "tokenize_chinese_chars": true,
13
- "tokenizer_class": "BertTokenizer",
 
14
  "unk_token": "[UNK]"
15
  }
 
1
  {
2
+ "add_prefix_space": false,
3
+ "bos_token": "<s>",
4
  "clean_up_tokenization_spaces": true,
5
  "cls_token": "[CLS]",
6
  "do_basic_tokenize": true,
7
  "do_lower_case": false,
8
+ "eos_token": "</s>",
9
+ "errors": "replace",
10
  "mask_token": "[MASK]",
11
  "model_max_length": 16384,
12
  "never_split": null,
 
14
  "sep_token": "[SEP]",
15
  "strip_accents": null,
16
  "tokenize_chinese_chars": true,
17
+ "tokenizer_class": "LongformerTokenizer",
18
+ "trim_offsets": true,
19
  "unk_token": "[UNK]"
20
  }