w11wo committed
Commit
2e61b14
1 Parent(s): bd215cf

config and tokenizer

Files changed (4)
  1. config.json +25 -0
  2. create_config.py +7 -0
  3. tokenizer.json +0 -0
  4. train_tokenizer.py +49 -0
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
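
The config above is the stock roberta-base architecture: 12 hidden layers, 12 attention heads, hidden size 768, and a 50265-token vocabulary matching the tokenizer trained in train_tokenizer.py. As a rough sketch (not part of this commit; it assumes transformers is installed and config.json sits in the working directory), the file can be loaded to initialize a randomly weighted masked LM:

from transformers import RobertaConfig, RobertaForMaskedLM

# Load the committed config from the repository root (path is assumed).
config = RobertaConfig.from_pretrained("./")

# Build a RoBERTa masked-LM with freshly initialized (untrained) weights.
model = RobertaForMaskedLM(config)
print(model.config.vocab_size)  # 50265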
create_config.py ADDED
@@ -0,0 +1,7 @@
+ from transformers import RobertaConfig
+
+ model_dir = "./"
+
+ config = RobertaConfig.from_pretrained("roberta-base")
+ config.save_pretrained(model_dir)
+
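
create_config.py pulls the default roberta-base configuration from the Hub and writes it to the repository root, which is what produced the config.json above. A hedged variation (not in this commit) showing that from_pretrained also accepts keyword overrides, in case any field ever needs to differ from the roberta-base defaults:

from transformers import RobertaConfig

# Sketch only: keyword arguments override fields of the loaded config.
config = RobertaConfig.from_pretrained("roberta-base", vocab_size=50265)
config.save_pretrained("./")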
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
train_tokenizer.py ADDED
@@ -0,0 +1,49 @@
+ from datasets import load_dataset, concatenate_datasets
+ from tokenizers import ByteLevelBPETokenizer
+ from pathlib import Path
+
+ dataset_language = "su"
+ validation_split_percentage = 10
+
+ # Load the datasets;
+ # only the train split is needed for tokenizer training.
+ oscar = load_dataset(
+     "oscar", f"unshuffled_deduplicated_{dataset_language}", split="train",
+ )
+
+ cc100 = load_dataset("cc100", lang=dataset_language, split="train")
+
+ mc4 = load_dataset("mc4", dataset_language, split="train")
+
+ wiki_files = [str(x) for x in Path("../docs").glob("*.txt")]
+ wiki = load_dataset("text", data_files=wiki_files)
+
+ # Keep only the "text" column so the datasets can be concatenated.
+ oscar = oscar.remove_columns("id")
+ mc4 = mc4.remove_columns(["url", "timestamp"])
+ cc100 = cc100.remove_columns("id")
+
+ dataset = concatenate_datasets([oscar, mc4, cc100, wiki["train"]])
+ dataset = dataset.train_test_split(test_size=validation_split_percentage / 100, seed=42)
+
+ # Instantiate a byte-level BPE tokenizer
+ tokenizer = ByteLevelBPETokenizer()
+
+
+ def batch_iterator(batch_size=10000):
+     for i in range(0, len(dataset["train"]), batch_size):
+         yield dataset["train"][i : i + batch_size]["text"]
+
+
+ # Train the tokenizer on the combined corpus
+ tokenizer.train_from_iterator(
+     batch_iterator(),
+     vocab_size=50265,
+     min_frequency=2,
+     special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
+ )
+
+ # Save files to disk
+ model_dir = "."
+ tokenizer.save(f"{model_dir}/tokenizer.json")
+
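
train_tokenizer.py concatenates the Sundanese ("su") train splits of OSCAR, CC-100, and mC4 with local wiki text files, then trains a byte-level BPE tokenizer with a 50265-token vocabulary; the special tokens are registered first, so <s>, <pad>, and </s> receive ids 0, 1, and 2, matching bos_token_id, pad_token_id, and eos_token_id in config.json. A small sketch (not part of the commit) of loading the saved tokenizer.json back through transformers and encoding a sample string:

from transformers import PreTrainedTokenizerFast

# Sketch only: wrap the raw tokenizers file in a fast tokenizer and
# re-declare the special tokens so downstream code can use them by name.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    bos_token="<s>",
    pad_token="<pad>",
    eos_token="</s>",
    unk_token="<unk>",
    mask_token="<mask>",
)
print(tokenizer("sample text").input_ids)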