from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
# from torch.optim import AdamW
import pandas as pd
from sklearn.model_selection import train_test_split

# assignment 3

model_name = "distilbert-base-uncased"


class ToxicDataset(Dataset):
    """Wraps tokenizer output and multi-hot label lists for the Trainer."""

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Multi-label classification needs float labels (for BCEWithLogitsLoss).
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.labels)


print("Reading data...")
data = pd.read_csv("./data/train.csv")
toxic_data = pd.DataFrame()
toxic_data["text"] = data["comment_text"]
# Columns 2 onward are the six binary toxicity labels; each row becomes a multi-hot list.
toxic_data["labels"] = data.iloc[:, 2:].values.tolist()

print("Data read. Splitting data...")
train_texts, val_texts, train_labels, val_labels = train_test_split(
    toxic_data.text.to_list(), toxic_data.labels.to_list(), test_size=0.2
)

print("Data split. Tokenizing data...")
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
# Tokenize as plain lists; ToxicDataset.__getitem__ converts them to tensors per example.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
train_dataset = ToxicDataset(train_encodings, train_labels)
val_dataset = ToxicDataset(val_encodings, val_labels)

print("Data tokenized. Beginning training...")
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
)

# problem_type="multi_label_classification" makes the model use a sigmoid +
# BCEWithLogitsLoss over the six labels instead of softmax cross-entropy,
# which is what the multi-hot label vectors require.
model = DistilBertForSequenceClassification.from_pretrained(
    model_name, num_labels=6, problem_type="multi_label_classification"
)
# The Trainer handles device placement itself, so no manual .to(device) is needed here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

# Manual training loop, kept for reference as an equivalent to the Trainer above.
# It also needs the AdamW import and `from torch.utils.data import DataLoader`.
# device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# model = DistilBertForSequenceClassification.from_pretrained(
#     model_name, num_labels=6, problem_type="multi_label_classification"
# )
# model.to(device)
# model.train()
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
# optim = AdamW(model.parameters(), lr=5e-5)
# num_train_epochs = 2
# for epoch in range(num_train_epochs):
#     for batch in train_loader:
#         optim.zero_grad()
#         input_ids = batch["input_ids"].to(device)
#         attention_mask = batch["attention_mask"].to(device)
#         labels = batch["labels"].to(device)
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optim.step()
# model.eval()

print("Training complete. Saving model...")
save_directory = "./results/model"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)  # save the tokenizer alongside the weights
print("Model saved.")
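
# Minimal inference sketch: reload the saved checkpoint and score one comment.
# Assumes the multi-label setup above, so per-class probabilities come from a
# sigmoid over the logits; the label names follow the Jigsaw train.csv columns
# and the example text is an arbitrary placeholder.
# loaded_model = DistilBertForSequenceClassification.from_pretrained(save_directory)
# loaded_tokenizer = DistilBertTokenizerFast.from_pretrained(save_directory)
# loaded_model.eval()
# inputs = loaded_tokenizer("example comment text", truncation=True, return_tensors="pt")
# with torch.no_grad():
#     logits = loaded_model(**inputs).logits
# probs = torch.sigmoid(logits)[0]
# label_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# print({name: round(float(p), 3) for name, p in zip(label_names, probs)})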