# import torch
# from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

# # Same as before
# checkpoint = "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
# sequences = [
#     "I've been waiting for a HuggingFace course my whole life.",
#     "This course is amazing!",
# ]
# batch = tokenizer(sequences, padding=True, truncation=True, return_tensors="pt")

# # This is new
# batch["labels"] = torch.tensor([1, 1])

# optimizer = AdamW(model.parameters())
# loss = model(**batch).loss
# loss.backward()
# optimizer.step()
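# The commented block above runs a single manual training step (forward pass,
# loss, backward pass, optimizer step) on a two-example batch; the rest of this
# script scales the same idea up to a full dataset.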
from datasets import load_dataset

# raw_datasets = load_dataset("glue", "sst2")
# raw_datasets

# raw_train_dataset = raw_datasets["train"]
# output = raw_train_dataset[0]["sentence"]
# print(output)

# raw_validation_dataset = raw_datasets["validation"]
# output = raw_validation_dataset[87]["sentence"]
# print(raw_validation_dataset.features)

# from transformers import AutoTokenizer
# checkpoint = "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# print(tokenizer(output))
# inputs = tokenizer(output)
# print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))

# inputs = tokenizer("This is the first sentence.")
# print(inputs)
# print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))

# # These columns exist in MRPC, not SST-2:
# # tokenized_sentences_1 = tokenizer(raw_datasets["train"]["sentence1"])
# # tokenized_sentences_2 = tokenizer(raw_datasets["train"]["sentence2"])
# # inputs = tokenizer("This is the first sentence.", "This is the second one.")
# # inputs = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
# # print(inputs)

# def tokenize_function(example):
#     return tokenizer(example["sentence"], truncation=True)

# tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# print(tokenized_datasets)

# from transformers import DataCollatorWithPadding
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# samples = tokenized_datasets["train"][:8]
# # SST-2 has a single "sentence" column; keep only the tokenizer outputs and labels
# samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence"]}
# print([len(x) for x in samples["input_ids"]])
# batch = data_collator(samples)
# print(batch)
# print({k: v.shape for k, v in batch.items()})
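# DataCollatorWithPadding implements dynamic padding: each batch is padded only
# to the length of its longest sequence, so the printed shapes above are
# [8, max_length_in_batch] rather than the model's maximum length.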
# Try it yourself
from datasets import load_dataset

raw_datasets = load_dataset("glue", "sst2")
raw_train_dataset = raw_datasets["train"]
output = raw_train_dataset[0]["sentence"]
# print(output)

from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# print(tokenizer(output))
inputs = tokenizer(output)
# print(tokenizer.convert_ids_to_tokens(inputs["input_ids"]))

# Tokenizing a single sentence directly (padding=True adds nothing for one sequence)
tokenized_dataset = tokenizer(
    output,
    padding=True,
    truncation=True,
)


# Tokenize the whole dataset with Dataset.map; batched=True passes batches of
# examples to tokenize_function at once, which is much faster
def tokenize_function(example):
    return tokenizer(example["sentence"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# print(tokenized_datasets)
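# Optional sanity check: the mapped dataset keeps the original SST-2 columns and
# adds the tokenizer outputs, e.g.
# print(tokenized_datasets["train"].column_names)
# # ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask']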
# from datasets import load_dataset
# from transformers import AutoTokenizer, DataCollatorWithPadding

# raw_datasets = load_dataset("glue", "mrpc")
# checkpoint = "bert-base-uncased"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# def tokenize_function(example):
#     return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

# tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# from transformers import TrainingArguments
# training_args = TrainingArguments("test-trainer")

# from transformers import AutoModelForSequenceClassification
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# from transformers import Trainer
# trainer = Trainer(
#     model,
#     training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
# )

# predictions = trainer.predict(tokenized_datasets["validation"])
# print(predictions.predictions.shape, predictions.label_ids.shape)

# import numpy as np
# preds = np.argmax(predictions.predictions, axis=-1)

# import evaluate
# metric = evaluate.load("glue", "mrpc")
# metric.compute(predictions=preds, references=predictions.label_ids)

# def compute_metrics(eval_preds):
#     metric = evaluate.load("glue", "mrpc")
#     logits, labels = eval_preds
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

# # Note: recent transformers versions rename evaluation_strategy to eval_strategy
# training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# trainer = Trainer(
#     model,
#     training_args,
#     train_dataset=tokenized_datasets["train"],
#     eval_dataset=tokenized_datasets["validation"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics,
# )
# trainer.train()
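# With evaluation_strategy="epoch", trainer.train() evaluates on the validation
# set at the end of every epoch and reports the compute_metrics output
# (accuracy and F1 for MRPC) alongside the training loss.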
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Keep only the columns the model expects: drop the raw text and index,
# rename "label" to "labels", and return PyTorch tensors
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
# print(tokenized_datasets["train"].column_names)
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

# Grab one batch to check that collation works
for batch in train_dataloader:
    break
output = {k: v.shape for k, v in batch.items()}
# print(output)
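# The exact shapes depend on the longest sequence sampled into the batch, e.g.
# {'labels': torch.Size([8]), 'input_ids': torch.Size([8, 65]),
#  'token_type_ids': torch.Size([8, 65]), 'attention_mask': torch.Size([8, 65])}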
from transformers import AutoModelForSequenceClassification

# num_labels=2 adds a fresh, randomly initialized classification head on top of BERT
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
outputs = model(**batch)
# print(outputs.loss, outputs.logits.shape)
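# Because the batch contains "labels", the model returns a scalar loss along with
# logits of shape [8, 2] (one row per example, one column per class).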
# The AdamW implementation in transformers is deprecated; use the PyTorch one
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
# Linear schedule: the learning rate decays from 5e-5 down to 0 over training
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)
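# With the SST-2 train split (~67k examples) and batch_size=8, expect roughly
# 25k training steps in total for the 3 epochs.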
# The training loop
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
# print(device)

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
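# Note: three epochs over ~67k examples is slow on CPU; running this on a GPU
# is strongly recommended.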
# The evaluation loop
import evaluate

# The dataset trained on above is SST-2, so load the matching GLUE metric
metric = evaluate.load("glue", "sst2")

model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

print(metric.compute())
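# metric.compute() returns a dict with the GLUE SST-2 metric (accuracy); a
# fine-tuned bert-base-uncased typically lands in the low 90s on the validation
# set, though the exact number varies from run to run.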