# -*- coding: utf-8 -*-
"""Fine Tuning Numer Two.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1iqPWMaXrktOsY2BwZNdQE8c1B4o1trit
"""
# Install dependencies (torch is pulled in via transformers[torch]).
!pip install -q -U datasets "transformers[torch]" accelerate huggingface_hub
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import load_dataset
# Load the dataset
dataset = load_dataset("ajibawa-2023/Children-Stories-Collection", trust_remote_code=True)
# Load the tokenizers for the two pre-trained checkpoints.
tokenizerOne = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
tokenizerTwo = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")
# Tokenize the dataset: pad and truncate every story to the models' 512-token limit.
def tokenize_function_one(examples):
    return tokenizerOne(examples["text"], padding="max_length", truncation=True, max_length=512)

def tokenize_function_two(examples):
    return tokenizerTwo(examples["text"], padding="max_length", truncation=True, max_length=512)
# Shuffle once with a fixed seed, then take 10,000 examples for training and 2,500 for evaluation.
tokenizedDatasetOne = dataset.map(tokenize_function_one, batched=True)
shuffled_dataset = tokenizedDatasetOne['train'].shuffle(seed=42)
tokenized_datasets_oneTrain = shuffled_dataset.select(range(10000))
tokenized_datasets_oneTest = shuffled_dataset.select(range(10000, 12500))
# The MLM collator randomly masks 15% of input tokens and builds the labels for masked-token prediction.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerOne, mlm_probability=0.15)
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
)
# Model One: google-bert/bert-base-cased
model_one = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-cased")
trainer_one = Trainer(
    model=model_one,
    args=training_args,
    train_dataset=tokenized_datasets_oneTrain,
    eval_dataset=tokenized_datasets_oneTest,
    data_collator=data_collator,
)
trainer_one.train()
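# Optional sanity check (not in the original notebook): report the eval loss on the held-out
# split and convert it to perplexity, assuming trainer_one was built with eval_dataset as above.
import math
eval_metrics_one = trainer_one.evaluate()
print(f"bert-base-cased eval loss: {eval_metrics_one['eval_loss']:.4f}, "
      f"perplexity: {math.exp(eval_metrics_one['eval_loss']):.2f}")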
# Hugging Face API token (redacted here); a token with write access is required to push to the Hub.
api_token = "redacted"
# Push the fine-tuned BERT model and its tokenizer to the Hub.
model_one.push_to_hub("emma7897/bert_two", token=api_token)
tokenizerOne.push_to_hub("emma7897/bert_two", token=api_token)
# Repeat the split and collator setup with the DistilBERT tokenizer.
tokenizedDatasetTwo = dataset.map(tokenize_function_two, batched=True)
shuffled_dataset = tokenizedDatasetTwo['train'].shuffle(seed=42)
tokenized_datasets_twoTrain = shuffled_dataset.select(range(10000))
tokenized_datasets_twoTest = shuffled_dataset.select(range(10000, 12500))
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerTwo, mlm_probability=0.15)
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
)
# Model Two: distilbert/distilbert-base-cased
model_two = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-cased")
trainer_two = Trainer(
    model=model_two,
    args=training_args,
    train_dataset=tokenized_datasets_twoTrain,
    eval_dataset=tokenized_datasets_twoTest,
    data_collator=data_collator,
)
trainer_two.train()
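# Optional sanity check (not in the original notebook): the same eval-loss/perplexity report for
# the DistilBERT run, so the two fine-tuned models can be compared on the same held-out split.
eval_metrics_two = trainer_two.evaluate()
print(f"distilbert-base-cased eval loss: {eval_metrics_two['eval_loss']:.4f}, "
      f"perplexity: {math.exp(eval_metrics_two['eval_loss']):.2f}")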
# Push the fine-tuned DistilBERT model and its tokenizer to the Hub.
model_two.push_to_hub("emma7897/distilbert_two", token=api_token)
tokenizerTwo.push_to_hub("emma7897/distilbert_two", token=api_token)
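# Quick usage sketch (not in the original notebook): once the pushes succeed, either fine-tuned
# model can be pulled straight from the Hub and queried with a fill-mask pipeline. The repo names
# assume the push_to_hub calls above completed under the emma7897 account.
from transformers import pipeline

fill_mask_bert = pipeline("fill-mask", model="emma7897/bert_two")
print(fill_mask_bert("The dragon lived in a [MASK] by the sea."))

fill_mask_distilbert = pipeline("fill-mask", model="emma7897/distilbert_two")
print(fill_mask_distilbert("The dragon lived in a [MASK] by the sea."))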