# Fine-tune GPT-2 on a custom text dataset with Hugging Face Transformers.
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# Load the pretrained tokenizer. GPT-2 has no pad token, so reuse the
# end-of-sequence token for padding (otherwise padding="max_length" fails).
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
# Load the pretrained GPT-2 model with its language-modeling head.
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Load the training split of the dataset.
dataset = load_dataset("i2ebuddy/website_data", split="train")

# Tokenize the text column, padding/truncating every example to 512 tokens.
dataset = dataset.map(
    lambda examples: tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
    ),
    batched=True,
)
dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
# Training configuration. No eval_dataset is passed to the Trainer, so
# evaluation stays disabled; the original evaluation_strategy="epoch"
# would raise a ValueError at Trainer construction without one.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    report_to="none",
)
# The collator copies input_ids to labels (mlm=False) so the Trainer can
# compute the causal language-modeling loss; without labels, training fails.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)
# Run fine-tuning.
trainer.train()
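
# A minimal follow-up sketch (not part of the original script): save the
# fine-tuned weights and generate a short sample to sanity-check the model.
# The "./fine_tuned_gpt2" path and the prompt text are illustrative
# assumptions, not values from the original.
trainer.save_model("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

prompt = "Welcome to our website"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_new_tokens=50,
    do_sample=True,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))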