# -*- coding: utf-8 -*-
"""Fine Tuning Number One.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1ICULTdmxijXHisMebXX5KmPzxzfZ2TtH
"""
# Install dependencies; transformers[torch] pulls in a compatible PyTorch build.
!pip install -q -U datasets "transformers[torch]" accelerate huggingface_hub
from transformers import AutoTokenizer, AutoModelForMaskedLM, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import load_dataset
# Load the Wikipedia mask-filling dataset: the 512-token config for training
# and the 4096-token config for evaluation.
datasetTrain = load_dataset("rcds/wikipedia-for-mask-filling", "original_512", trust_remote_code=True)
datasetTest = load_dataset("rcds/wikipedia-for-mask-filling", "original_4096", trust_remote_code=True)
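# Optional sanity check (not part of the original notebook): confirm the split
# layout and the "texts" column used by the tokenization functions below.
print(datasetTrain)
print(datasetTrain["train"][0]["texts"][:200])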
# Load the tokenizers for the two pre-trained checkpoints.
tokenizerOne = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
tokenizerTwo = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")
# Tokenization functions: pad and truncate every example to 512 tokens so both
# models see fixed-length inputs.
def tokenize_function_one(examples):
    return tokenizerOne(examples["texts"], padding="max_length", truncation=True, max_length=512)

def tokenize_function_two(examples):
    return tokenizerTwo(examples["texts"], padding="max_length", truncation=True, max_length=512)
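# Quick check (illustrative, not in the original notebook): a short string should
# come back padded to exactly 512 tokens.
sample_encoding = tokenize_function_one({"texts": ["Paris is the capital of France."]})
print(len(sample_encoding["input_ids"][0]))  # expected: 512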
# Tokenize the training and evaluation data, then subsample to 10,000 training
# and 2,500 evaluation examples for the BERT run.
tokenized_datasets_oneTrain = datasetTrain.map(tokenize_function_one, batched=True)
tokenized_datasets_oneTest = datasetTest.map(tokenize_function_one, batched=True)
tokenized_datasets_oneTrain = tokenized_datasets_oneTrain["train"].select(range(10000))
tokenized_datasets_oneTest = tokenized_datasets_oneTest["train"].select(range(2500))

# Dynamic masking for masked language modeling: 15% of tokens are masked in each batch.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerOne, mlm_probability=0.15)
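# Illustrative peek at what the collator produces (an assumption, not part of the
# original flow): masked positions get [MASK] ids in input_ids, and labels are
# -100 everywhere except those positions, so only they contribute to the loss.
example = tokenizerOne("The quick brown fox jumps over the lazy dog.")
batch = data_collator([example])
print(batch["input_ids"][0])
print(batch["labels"][0])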
# Training configuration for the BERT run.
training_args = TrainingArguments(
    "test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
)
# Model One: google-bert/bert-base-cased
model_one = AutoModelForMaskedLM.from_pretrained("google-bert/bert-base-cased")

trainer_one = Trainer(
    model=model_one,
    args=training_args,
    train_dataset=tokenized_datasets_oneTrain,
    eval_dataset=tokenized_datasets_oneTest,
    data_collator=data_collator,
)

trainer_one.train()
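# Optional evaluation step (assumed, not in the original notebook): the eval loss
# returned by Trainer.evaluate() is a cross-entropy over the masked tokens, so
# exp(loss) gives a perplexity figure for the held-out split.
import math
eval_results_one = trainer_one.evaluate()
print(f"BERT eval loss: {eval_results_one['eval_loss']:.4f}, "
      f"perplexity: {math.exp(eval_results_one['eval_loss']):.2f}")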
# Hugging Face API token (redacted here); required to push the model and tokenizer to the Hub.
api_token = "redacted"

# Push the fine-tuned BERT model and its tokenizer to the Hub.
model_one.push_to_hub("emma7897/bert_one", token=api_token)
tokenizerOne.push_to_hub("emma7897/bert_one", token=api_token)
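# Quick smoke test (illustrative; assumes the push above succeeded and the token
# can read the repo): load the fine-tuned checkpoint back from the Hub and fill
# a masked token.
from transformers import pipeline
fill_mask_bert = pipeline("fill-mask", model="emma7897/bert_one", token=api_token)
print(fill_mask_bert("The capital of France is [MASK]."))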
# Tokenize and subsample the datasets again, this time with the DistilBERT tokenizer.
tokenized_datasets_twoTrain = datasetTrain.map(tokenize_function_two, batched=True)
tokenized_datasets_twoTest = datasetTest.map(tokenize_function_two, batched=True)
tokenized_datasets_twoTrain = tokenized_datasets_twoTrain["train"].select(range(10000))
tokenized_datasets_twoTest = tokenized_datasets_twoTest["train"].select(range(2500))

# Re-create the collator so masking uses the DistilBERT tokenizer.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizerTwo, mlm_probability=0.15)
# Training configuration for the DistilBERT run; the smaller model fits a larger per-device batch.
training_args = TrainingArguments(
    "test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=48,
    per_device_eval_batch_size=48,
    warmup_steps=500,
    weight_decay=0.01,
)
# Model Two: distilbert/distilbert-base-cased
model_two = AutoModelForMaskedLM.from_pretrained("distilbert/distilbert-base-cased")

trainer_two = Trainer(
    model=model_two,
    args=training_args,
    train_dataset=tokenized_datasets_twoTrain,
    eval_dataset=tokenized_datasets_twoTest,
    data_collator=data_collator,
)

trainer_two.train()
# Push the fine-tuned DistilBERT model and its tokenizer to the Hub.
model_two.push_to_hub("emma7897/distilbert_one", token=api_token)
tokenizerTwo.push_to_hub("emma7897/distilbert_one", token=api_token)
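# Side-by-side check (illustrative, assuming both pushes succeeded): run the same
# masked sentence through the DistilBERT checkpoint and compare its top predictions
# with the BERT output printed earlier.
from transformers import pipeline
fill_mask_distilbert = pipeline("fill-mask", model="emma7897/distilbert_one", token=api_token)
for prediction in fill_mask_distilbert("The capital of France is [MASK]."):
    print(prediction["token_str"], round(prediction["score"], 4))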